
Commit 8557308

Update utilities for NAN codes:

* update export utility to export, validate, and test the missing cols
* add deletion coding to the archiver, make it expect missing cols, and let it handle comparisons between missing and non-missing CSVs
1 parent 33537f0 · commit 8557308

File tree

4 files changed: +235 -39 lines changed

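Note: the diffs below rely on delphi_utils.nancodes.Nans, which is not itself part of this commit. For reference, a minimal sketch consistent with the two members used here (NOT_MISSING and DELETED) might look like the following; the exact numeric values and any further members are assumptions:

from enum import IntEnum

class Nans(IntEnum):
    """Hypothetical sketch of the missingness codes referenced in this diff."""
    NOT_MISSING = 0  # value is present and valid
    DELETED = 4      # value was removed in a later issue (code value assumed)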

_delphi_utils_python/delphi_utils/archive.py (+27 -8)
@@ -40,9 +40,11 @@
 from git import Repo
 from git.refs.head import Head
 import pandas as pd
+import numpy as np

 from .utils import read_params
 from .logger import get_structured_logger
+from .nancodes import Nans

 Files = List[str]
 FileDiffMap = Dict[str, Optional[str]]
@@ -73,8 +75,10 @@ def diff_export_csv(
     changed_df is the pd.DataFrame of common rows from after_csv with changed values.
     added_df is the pd.DataFrame of added rows from after_csv.
     """
-    export_csv_dtypes = {"geo_id": str, "val": float,
-                         "se": float, "sample_size": float}
+    export_csv_dtypes = {
+        "geo_id": str, "val": float, "se": float, "sample_size": float,
+        "missing_val": int, "missing_se": int, "missing_sample_size": int
+    }

     before_df = pd.read_csv(before_csv, dtype=export_csv_dtypes)
     before_df.set_index("geo_id", inplace=True)
@@ -89,12 +93,27 @@ def diff_export_csv(
     before_df_cmn = before_df.reindex(common_idx)
     after_df_cmn = after_df.reindex(common_idx)

-    # Exact comparisons, treating NA == NA as True
-    same_mask = before_df_cmn == after_df_cmn
-    same_mask |= pd.isna(before_df_cmn) & pd.isna(after_df_cmn)
+    # If the CSVs have different schemas (only one carries the missingness
+    # columns), mark all common values as new
+    if ("missing_val" in before_df_cmn.columns) ^ ("missing_val" in after_df_cmn.columns):
+        same_mask = after_df_cmn.copy()
+        same_mask.loc[:] = False
+    else:
+        # Exact comparisons, treating NA == NA as True
+        same_mask = before_df_cmn == after_df_cmn
+        same_mask |= pd.isna(before_df_cmn) & pd.isna(after_df_cmn)
+
+    # Code deleted entries as NaNs with the DELETED missing code
+    deleted_df = before_df.loc[deleted_idx, :].copy()
+    deleted_df[["val", "se", "sample_size"]] = np.nan
+    deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED

     return (
-        before_df.loc[deleted_idx, :],
+        deleted_df,
         after_df_cmn.loc[~(same_mask.all(axis=1)), :],
         after_df.loc[added_idx, :])
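
The XOR check above guards the transition period: a cached CSV written before this change has no missing_* columns, while a freshly exported one does. A small standalone sketch of that branch, with illustrative values:

import pandas as pd

# One frame predates the missingness columns, the other carries them.
before_cmn = pd.DataFrame({"val": [1.0]}, index=["1"])
after_cmn = pd.DataFrame({"val": [1.0], "missing_val": [0]}, index=["1"])

if ("missing_val" in before_cmn.columns) ^ ("missing_val" in after_cmn.columns):
    # Schemas differ, so no row can be trusted as unchanged: mark everything
    # as new so it is re-issued with the missing_* columns attached.
    same_mask = after_cmn.copy()
    same_mask.loc[:] = False

print(same_mask)  # all False, even though "val" itself is identical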

@@ -227,11 +246,11 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]:
             deleted_df, changed_df, added_df = diff_export_csv(
                 before_file, after_file)
-            new_issues_df = pd.concat([changed_df, added_df], axis=0)
+            new_issues_df = pd.concat([deleted_df, changed_df, added_df], axis=0)

             if len(deleted_df) > 0:
                 print(
-                    f"Warning, diff has deleted indices in {after_file} that will be ignored")
+                    f"Diff has deleted indices in {after_file} that have been coded as nans.")

             # Write the diffs to diff_file, if applicable
             if len(new_issues_df) > 0:
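
With these changes a row that disappears between two snapshots is no longer dropped from the diff. A hedged usage sketch of the new behavior (the file names and values are made up for illustration):

import pandas as pd
from delphi_utils.archive import diff_export_csv
from delphi_utils.nancodes import Nans

# geo_id "2" exists in the before snapshot but not in the after snapshot.
pd.DataFrame({
    "geo_id": ["1", "2"], "val": [1.0, 2.0], "se": [0.1, 0.2],
    "sample_size": [10.0, 20.0],
    "missing_val": [Nans.NOT_MISSING] * 2,
    "missing_se": [Nans.NOT_MISSING] * 2,
    "missing_sample_size": [Nans.NOT_MISSING] * 2,
}).to_csv("before.csv", index=False)
pd.DataFrame({
    "geo_id": ["1"], "val": [1.0], "se": [0.1], "sample_size": [10.0],
    "missing_val": [Nans.NOT_MISSING],
    "missing_se": [Nans.NOT_MISSING],
    "missing_sample_size": [Nans.NOT_MISSING],
}).to_csv("after.csv", index=False)

deleted_df, changed_df, added_df = diff_export_csv("before.csv", "after.csv")
# deleted_df now carries geo_id "2" with val/se/sample_size set to NaN and the
# three missing_* columns set to Nans.DELETED; diff_exports then folds it into
# new_issues_df instead of ignoring the deletion.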

_delphi_utils_python/delphi_utils/export.py (+40 -2)
@@ -3,10 +3,32 @@
 from datetime import datetime
 from os.path import join
 from typing import Optional
+import logging

 import numpy as np
 import pandas as pd

+from .nancodes import Nans
+
+def filter_contradicting_missing_codes(df, sensor, metric, date, logger=None):
+    """Find values with contradictory missingness codes, filter them, and log."""
+    columns = ["val", "se", "sample_size"]
+    # Get indices where the XNOR is true (i.e. both are true or both are false).
+    masks = [
+        ~(df[column].isna() ^ df["missing_" + column].eq(Nans.NOT_MISSING))
+        for column in columns
+    ]
+    for mask in masks:
+        if logger is not None and df.loc[mask].size > 0:
+            logger.info(
+                "Filtering contradictory missing code in " +
+                "{0}_{1}_{2}.".format(sensor, metric, date.strftime(format="%Y-%m-%d"))
+            )
+            df = df.loc[~mask]
+        elif logger is None and df.loc[mask].size > 0:
+            df = df.loc[~mask]
+    return df
+
 def create_export_csv(
     df: pd.DataFrame,
     export_dir: str,
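
The XNOR in filter_contradicting_missing_codes flags a row whenever the NaN-ness of a value disagrees with its code: NaN but coded NOT_MISSING, or present but coded with a missing reason. A tiny illustration with made-up values:

import numpy as np
import pandas as pd
from delphi_utils.nancodes import Nans

df = pd.DataFrame({
    "val": [1.0, np.nan, np.nan],
    "missing_val": [Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.DELETED],
})
# Row 0: present and coded NOT_MISSING       -> consistent, kept
# Row 1: NaN but coded NOT_MISSING           -> contradiction, filtered
# Row 2: NaN and coded with a missing reason -> consistent, kept
mask = ~(df["val"].isna() ^ df["missing_val"].eq(Nans.NOT_MISSING))
print(mask.tolist())  # [False, True, False]
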
@@ -15,7 +37,8 @@ def create_export_csv(
     metric: Optional[str] = None,
     start_date: Optional[datetime] = None,
     end_date: Optional[datetime] = None,
-    remove_null_samples: Optional[bool] = False
+    remove_null_samples: Optional[bool] = False,
+    logger: Optional[logging.Logger] = None
 ):
     """Export data in the format expected by the Delphi API.
@@ -39,6 +62,8 @@ def create_export_csv(
         Latest date to export or None if no maximum date restrictions should be applied.
     remove_null_samples: Optional[bool]
         Whether to remove entries whose sample sizes are null.
+    logger: Optional[logging.Logger]
+        Pass a logger object here to log information about contradictory missing codes.

     Returns
     ---------
@@ -64,7 +89,20 @@ def create_export_csv(
         else:
             export_filename = f"{date.strftime('%Y%m%d')}_{geo_res}_{metric}_{sensor}.csv"
         export_file = join(export_dir, export_filename)
-        export_df = df[df["timestamp"] == date][["geo_id", "val", "se", "sample_size",]]
+        expected_columns = [
+            "geo_id",
+            "val",
+            "se",
+            "sample_size",
+            "missing_val",
+            "missing_se",
+            "missing_sample_size"
+        ]
+        export_df = df[df["timestamp"] == date].filter(items=expected_columns)
+        if "missing_val" in export_df.columns:
+            export_df = filter_contradicting_missing_codes(
+                export_df, sensor, metric, date, logger=logger
+            )
         if remove_null_samples:
             export_df = export_df[export_df["sample_size"].notnull()]
         export_df = export_df.round({"val": 7, "se": 7})
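
Putting the export changes together, a hedged sketch of calling create_export_csv with the new columns and a logger; the geo_res parameter and the timestamp column are visible in the code above, but the rest of the call is an assumption about the full signature:

import logging
from datetime import datetime

import numpy as np
import pandas as pd
from delphi_utils.export import create_export_csv
from delphi_utils.nancodes import Nans

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("export_demo")

# The second row is contradictory: val is NaN but coded NOT_MISSING, so
# filter_contradicting_missing_codes should drop it and log a message.
df = pd.DataFrame({
    "timestamp": [datetime(2020, 6, 1)] * 2,
    "geo_id": ["ca", "tx"],
    "val": [1.5, np.nan],
    "se": [0.1, 0.2],
    "sample_size": [100.0, 200.0],
    "missing_val": [Nans.NOT_MISSING] * 2,
    "missing_se": [Nans.NOT_MISSING] * 2,
    "missing_sample_size": [Nans.NOT_MISSING] * 2,
})

create_export_csv(
    df, export_dir=".", geo_res="state", sensor="demo_sensor", logger=logger
)
# Should write 20200601_state_demo_sensor.csv containing only the "ca" row.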

_delphi_utils_python/tests/test_archive.py (+93 -28)
@@ -13,30 +13,53 @@
 import pytest

 from delphi_utils.archive import ArchiveDiffer, GitArchiveDiffer, S3ArchiveDiffer,\
-    archiver_from_params
+    archiver_from_params, Nans

-CSV_DTYPES = {"geo_id": str, "val": float, "se": float, "sample_size": float}
+CSV_DTYPES = {
+    "geo_id": str, "val": float, "se": float, "sample_size": float,
+    "missing_val": int, "missing_se": int, "missing_sample_size": int
+}

 CSVS_BEFORE = {
     # Common
     "csv0": pd.DataFrame({
         "geo_id": ["1", "2", "3"],
         "val": [1.000000001, 2.00000002, 3.00000003],
         "se": [0.1, 0.2, 0.3],
-        "sample_size": [10.0, 20.0, 30.0]}),
+        "sample_size": [10.0, 20.0, 30.0],
+        "missing_val": [Nans.NOT_MISSING] * 3,
+        "missing_se": [Nans.NOT_MISSING] * 3,
+        "missing_sample_size": [Nans.NOT_MISSING] * 3,
+    }),

     "csv1": pd.DataFrame({
         "geo_id": ["1", "2", "3"],
         "val": [1.0, 2.0, 3.0],
         "se": [np.nan, 0.20000002, 0.30000003],
-        "sample_size": [10.0, 20.0, 30.0]}),
+        "sample_size": [10.0, 20.0, 30.0],
+        "missing_val": [Nans.NOT_MISSING] * 3,
+        "missing_se": [Nans.NOT_MISSING] * 3,
+        "missing_sample_size": [Nans.NOT_MISSING] * 3,
+    }),

     # Deleted
     "csv2": pd.DataFrame({
         "geo_id": ["1"],
         "val": [1.0],
         "se": [0.1],
-        "sample_size": [10.0]}),
+        "sample_size": [10.0],
+        "missing_val": [Nans.NOT_MISSING],
+        "missing_se": [Nans.NOT_MISSING],
+        "missing_sample_size": [Nans.NOT_MISSING],
+    }),
+
+    # Common, but updated with missing columns
+    "csv4": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0]
+    }),
 }

 CSVS_AFTER = {
@@ -45,23 +68,45 @@
         "geo_id": ["1", "2", "3"],
         "val": [1.0, 2.0, 3.0],
         "se": [0.10000001, 0.20000002, 0.30000003],
-        "sample_size": [10.0, 20.0, 30.0]}),
+        "sample_size": [10.0, 20.0, 30.0],
+        "missing_val": [Nans.NOT_MISSING] * 3,
+        "missing_se": [Nans.NOT_MISSING] * 3,
+        "missing_sample_size": [Nans.NOT_MISSING] * 3,
+    }),

     "csv1": pd.DataFrame({
         "geo_id": ["1", "2", "4"],
         "val": [1.0, 2.1, 4.0],
         "se": [np.nan, 0.21, np.nan],
-        "sample_size": [10.0, 21.0, 40.0]}),
+        "sample_size": [10.0, 21.0, 40.0],
+        "missing_val": [Nans.NOT_MISSING] * 3,
+        "missing_se": [Nans.NOT_MISSING] * 3,
+        "missing_sample_size": [Nans.NOT_MISSING] * 3,
+    }),

     # Added
     "csv3": pd.DataFrame({
         "geo_id": ["2"],
         "val": [2.0000002],
         "se": [0.2],
-        "sample_size": [20.0]}),
+        "sample_size": [20.0],
+        "missing_val": [Nans.NOT_MISSING],
+        "missing_se": [Nans.NOT_MISSING],
+        "missing_sample_size": [Nans.NOT_MISSING],
+    }),
+
+    # Common, but updated with missing columns
+    "csv4": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0],
+        "missing_val": [Nans.NOT_MISSING],
+        "missing_se": [Nans.NOT_MISSING],
+        "missing_sample_size": [Nans.NOT_MISSING],
+    }),
 }

-
 class TestArchiveDiffer:

     def test_stubs(self):
@@ -80,10 +125,14 @@ def test_diff_and_filter_exports(self, tmp_path):
         mkdir(export_dir)

         csv1_diff = pd.DataFrame({
-            "geo_id": ["2", "4"],
-            "val": [2.1, 4.0],
-            "se": [0.21, np.nan],
-            "sample_size": [21.0, 40.0]})
+            "geo_id": ["3", "2", "4"],
+            "val": [np.nan, 2.1, 4.0],
+            "se": [np.nan, 0.21, np.nan],
+            "sample_size": [np.nan, 21.0, 40.0],
+            "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+            "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+            "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+        })

         arch_diff = ArchiveDiffer(cache_dir, export_dir)
@@ -106,15 +155,15 @@ def test_diff_and_filter_exports(self, tmp_path):
         # Check return values
         assert set(deleted_files) == {join(cache_dir, "csv2.csv")}
         assert set(common_diffs.keys()) == {
-            join(export_dir, f) for f in ["csv0.csv", "csv1.csv"]}
+            join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv"]}
         assert set(new_files) == {join(export_dir, "csv3.csv")}
         assert common_diffs[join(export_dir, "csv0.csv")] is None
         assert common_diffs[join(export_dir, "csv1.csv")] == join(
             export_dir, "csv1.csv.diff")

         # Check filesystem for actual files
         assert set(listdir(export_dir)) == {
-            "csv0.csv", "csv1.csv", "csv1.csv.diff", "csv3.csv"}
+            "csv0.csv", "csv1.csv", "csv1.csv.diff", "csv3.csv", "csv4.csv", "csv4.csv.diff"}
         assert_frame_equal(
             pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES),
             csv1_diff)
@@ -132,7 +181,7 @@ def test_diff_and_filter_exports(self, tmp_path):
         arch_diff.filter_exports(common_diffs)

         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
         assert_frame_equal(
             pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
             csv1_diff)
@@ -259,12 +308,16 @@ def test_run(self, tmp_path, s3_client):
             assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), df)

         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
         csv1_diff = pd.DataFrame({
-            "geo_id": ["2", "4"],
-            "val": [2.1, 4.0],
-            "se": [0.21, np.nan],
-            "sample_size": [21.0, 40.0]})
+            "geo_id": ["3", "2", "4"],
+            "val": [np.nan, 2.1, 4.0],
+            "se": [np.nan, 0.21, np.nan],
+            "sample_size": [np.nan, 21.0, 40.0],
+            "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+            "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+            "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+        })
         assert_frame_equal(
             pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
             csv1_diff)
@@ -346,7 +399,11 @@ def test_diff_exports(self, tmp_path):
             "geo_id": ["1", "2", "3"],
             "val": [1.0, 2.0, 3.0],
             "se": [0.1, 0.2, 0.3],
-            "sample_size": [10.0, 20.0, 30.0]})
+            "sample_size": [10.0, 20.0, 30.0],
+            "missing_val": [Nans.NOT_MISSING] * 3,
+            "missing_se": [Nans.NOT_MISSING] * 3,
+            "missing_sample_size": [Nans.NOT_MISSING] * 3,
+        })

         # Write exact same CSV into cache and export, so no diffs expected
         csv1.to_csv(join(cache_dir, "csv1.csv"), index=False)
@@ -383,7 +440,11 @@ def test_archive_exports(self, tmp_path):
             "geo_id": ["1", "2", "3"],
             "val": [1.0, 2.0, 3.0],
             "se": [0.1, 0.2, 0.3],
-            "sample_size": [10.0, 20.0, 30.0]})
+            "sample_size": [10.0, 20.0, 30.0],
+            "missing_val": [Nans.NOT_MISSING] * 3,
+            "missing_se": [Nans.NOT_MISSING] * 3,
+            "missing_sample_size": [Nans.NOT_MISSING] * 3,
+        })

         # csv1.csv is now a dirty edit in the repo, and to be exported too
         csv1.to_csv(join(cache_dir, "csv1.csv"), index=False)
@@ -460,12 +521,16 @@ def test_run(self, tmp_path):
         original_branch.checkout()

         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
         csv1_diff = pd.DataFrame({
-            "geo_id": ["2", "4"],
-            "val": [2.1, 4.0],
-            "se": [0.21, np.nan],
-            "sample_size": [21.0, 40.0]})
+            "geo_id": ["3", "2", "4"],
+            "val": [np.nan, 2.1, 4.0],
+            "se": [np.nan, 0.21, np.nan],
+            "sample_size": [np.nan, 21.0, 40.0],
+            "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+            "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+            "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+        })
         assert_frame_equal(
             pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
             csv1_diff)
