import pytest

from delphi_utils.archive import ArchiveDiffer, GitArchiveDiffer, S3ArchiveDiffer,\
-    archiver_from_params
+    archiver_from_params, Nans

-CSV_DTYPES = {"geo_id": str, "val": float, "se": float, "sample_size": float}
+CSV_DTYPES = {
+    "geo_id": str, "val": float, "se": float, "sample_size": float,
+    "missing_val": int, "missing_se": int, "missing_sample_size": int
+}

CSVS_BEFORE = {
    # Common
    "csv0": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.000000001, 2.00000002, 3.00000003],
        "se": [0.1, 0.2, 0.3],
-        "sample_size": [10.0, 20.0, 30.0]}),
+        "sample_size": [10.0, 20.0, 30.0],
+        "missing_val": [Nans.NOT_MISSING] * 3,
+        "missing_se": [Nans.NOT_MISSING] * 3,
+        "missing_sample_size": [Nans.NOT_MISSING] * 3,
+    }),

    "csv1": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.0, 2.0, 3.0],
        "se": [np.nan, 0.20000002, 0.30000003],
-        "sample_size": [10.0, 20.0, 30.0]}),
+        "sample_size": [10.0, 20.0, 30.0],
+        "missing_val": [Nans.NOT_MISSING] * 3,
+        "missing_se": [Nans.NOT_MISSING] * 3,
+        "missing_sample_size": [Nans.NOT_MISSING] * 3,
+    }),

    # Deleted
    "csv2": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "se": [0.1],
-        "sample_size": [10.0]}),
+        "sample_size": [10.0],
+        "missing_val": [Nans.NOT_MISSING],
+        "missing_se": [Nans.NOT_MISSING],
+        "missing_sample_size": [Nans.NOT_MISSING],
+    }),
+
+    # Common, but updated with missing columns
+    "csv4": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0]
+    }),
}

CSVS_AFTER = {
    # Common
    "csv0": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.0, 2.0, 3.0],
        "se": [0.10000001, 0.20000002, 0.30000003],
-        "sample_size": [10.0, 20.0, 30.0]}),
+        "sample_size": [10.0, 20.0, 30.0],
+        "missing_val": [Nans.NOT_MISSING] * 3,
+        "missing_se": [Nans.NOT_MISSING] * 3,
+        "missing_sample_size": [Nans.NOT_MISSING] * 3,
+    }),

    "csv1": pd.DataFrame({
        "geo_id": ["1", "2", "4"],
        "val": [1.0, 2.1, 4.0],
        "se": [np.nan, 0.21, np.nan],
-        "sample_size": [10.0, 21.0, 40.0]}),
+        "sample_size": [10.0, 21.0, 40.0],
+        "missing_val": [Nans.NOT_MISSING] * 3,
+        "missing_se": [Nans.NOT_MISSING] * 3,
+        "missing_sample_size": [Nans.NOT_MISSING] * 3,
+    }),

    # Added
    "csv3": pd.DataFrame({
        "geo_id": ["2"],
        "val": [2.0000002],
        "se": [0.2],
-        "sample_size": [20.0]}),
+        "sample_size": [20.0],
+        "missing_val": [Nans.NOT_MISSING],
+        "missing_se": [Nans.NOT_MISSING],
+        "missing_sample_size": [Nans.NOT_MISSING],
+    }),
+
+    # Common, but updated with missing columns
+    "csv4": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0],
+        "missing_val": [Nans.NOT_MISSING],
+        "missing_se": [Nans.NOT_MISSING],
+        "missing_sample_size": [Nans.NOT_MISSING],
+    }),
}

-
class TestArchiveDiffer:

    def test_stubs(self):
@@ -80,10 +125,14 @@ def test_diff_and_filter_exports(self, tmp_path):
        mkdir(export_dir)

        csv1_diff = pd.DataFrame({
-            "geo_id": ["2", "4"],
-            "val": [2.1, 4.0],
-            "se": [0.21, np.nan],
-            "sample_size": [21.0, 40.0]})
+            "geo_id": ["3", "2", "4"],
+            "val": [np.nan, 2.1, 4.0],
+            "se": [np.nan, 0.21, np.nan],
+            "sample_size": [np.nan, 21.0, 40.0],
+            "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+            "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+            "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+        })

        arch_diff = ArchiveDiffer(cache_dir, export_dir)

@@ -106,15 +155,15 @@ def test_diff_and_filter_exports(self, tmp_path):
        # Check return values
        assert set(deleted_files) == {join(cache_dir, "csv2.csv")}
        assert set(common_diffs.keys()) == {
-            join(export_dir, f) for f in ["csv0.csv", "csv1.csv"]}
+            join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv"]}
        assert set(new_files) == {join(export_dir, "csv3.csv")}
        assert common_diffs[join(export_dir, "csv0.csv")] is None
        assert common_diffs[join(export_dir, "csv1.csv")] == join(
            export_dir, "csv1.csv.diff")

        # Check filesystem for actual files
        assert set(listdir(export_dir)) == {
-            "csv0.csv", "csv1.csv", "csv1.csv.diff", "csv3.csv"}
+            "csv0.csv", "csv1.csv", "csv1.csv.diff", "csv3.csv", "csv4.csv", "csv4.csv.diff"}
        assert_frame_equal(
            pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES),
            csv1_diff)
@@ -132,7 +181,7 @@ def test_diff_and_filter_exports(self, tmp_path):
        arch_diff.filter_exports(common_diffs)

        # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
        assert_frame_equal(
            pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
            csv1_diff)
@@ -259,12 +308,16 @@ def test_run(self, tmp_path, s3_client):
            assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), df)

        # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
        csv1_diff = pd.DataFrame({
-            "geo_id": ["2", "4"],
-            "val": [2.1, 4.0],
-            "se": [0.21, np.nan],
-            "sample_size": [21.0, 40.0]})
+            "geo_id": ["3", "2", "4"],
+            "val": [np.nan, 2.1, 4.0],
+            "se": [np.nan, 0.21, np.nan],
+            "sample_size": [np.nan, 21.0, 40.0],
+            "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+            "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+            "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+        })
        assert_frame_equal(
            pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
            csv1_diff)
@@ -346,7 +399,11 @@ def test_diff_exports(self, tmp_path):
            "geo_id": ["1", "2", "3"],
            "val": [1.0, 2.0, 3.0],
            "se": [0.1, 0.2, 0.3],
-            "sample_size": [10.0, 20.0, 30.0]})
+            "sample_size": [10.0, 20.0, 30.0],
+            "missing_val": [Nans.NOT_MISSING] * 3,
+            "missing_se": [Nans.NOT_MISSING] * 3,
+            "missing_sample_size": [Nans.NOT_MISSING] * 3,
+        })

        # Write exact same CSV into cache and export, so no diffs expected
        csv1.to_csv(join(cache_dir, "csv1.csv"), index=False)
@@ -383,7 +440,11 @@ def test_archive_exports(self, tmp_path):
            "geo_id": ["1", "2", "3"],
            "val": [1.0, 2.0, 3.0],
            "se": [0.1, 0.2, 0.3],
-            "sample_size": [10.0, 20.0, 30.0]})
+            "sample_size": [10.0, 20.0, 30.0],
+            "missing_val": [Nans.NOT_MISSING] * 3,
+            "missing_se": [Nans.NOT_MISSING] * 3,
+            "missing_sample_size": [Nans.NOT_MISSING] * 3,
+        })

        # csv1.csv is now a dirty edit in the repo, and to be exported too
        csv1.to_csv(join(cache_dir, "csv1.csv"), index=False)
@@ -460,12 +521,16 @@ def test_run(self, tmp_path):
        original_branch.checkout()

        # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
        csv1_diff = pd.DataFrame({
-            "geo_id": ["2", "4"],
-            "val": [2.1, 4.0],
-            "se": [0.21, np.nan],
-            "sample_size": [21.0, 40.0]})
+            "geo_id": ["3", "2", "4"],
+            "val": [np.nan, 2.1, 4.0],
+            "se": [np.nan, 0.21, np.nan],
+            "sample_size": [np.nan, 21.0, 40.0],
+            "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+            "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+            "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
+        })
        assert_frame_equal(
            pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
            csv1_diff)
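
For context on the Nans codes used throughout this diff: the tests import Nans from delphi_utils and write it into the new missing_* columns, which CSV_DTYPES reads back as int. A minimal sketch of the semantics these tests rely on is below; the member names NOT_MISSING and DELETED come from the diff itself, while the integer values and the IntEnum base are assumptions about the real delphi_utils definition.

# Sketch only: the real enum ships with delphi_utils; values here are assumed.
from enum import IntEnum

class Nans(IntEnum):
    NOT_MISSING = 0  # the corresponding value column holds a real number
    DELETED = 4      # the row existed in the cached CSV but is absent from the new export

# In a diff row like geo_id "3" in csv1_diff above, the value columns are NaN and
# every missing_* column is set to Nans.DELETED, so consumers of the incremental
# CSV can distinguish an explicit deletion from an ordinary missing value.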