1
- from dataclasses import dataclass , asdict , field
2
- from typing import Dict , Any
1
+ from dataclasses import InitVar , dataclass , asdict , field
2
+ from typing import Dict , Any , List , Set
3
3
from enum import Enum
4
4
5
5
@@ -13,6 +13,7 @@ class SignalFormat(str, Enum):
13
13
per100k = "per100k"
14
14
percent = "percent"
15
15
fraction = "fraction"
16
+ raw_count = "raw_count"
16
17
raw = "raw"
17
18
18
19
@@ -23,8 +24,11 @@ class SignalCategory(str, Enum):
23
24
other = "other"
24
25
25
26
26
def guess_name(source: str, signal: str, is_weighted: bool) -> str:
    """Derive a human-readable display name from a signal identifier.

    For weighted ``fb-survey`` signals the ``smoothed_w`` / ``raw_w`` markers
    are first expanded to ``smoothed_weighted_`` / ``raw_weighted_``.  The
    identifier is then split on underscores, every word is capitalized and the
    words are joined with single spaces.  Finally " Ili" and " Cli" are
    rewritten as " ILI" and " CLI", and "Dont" is spelled out as "Do Not".

    Args:
        source: data source identifier (only ``"fb-survey"`` is special-cased).
        signal: signal identifier, words separated by underscores.
        is_weighted: whether the signal is considered a weighted variant.

    Returns:
        The display name.
    """
    readable = signal
    if source == "fb-survey" and is_weighted:
        # spell out the terse "w" marker before splitting into words
        readable = readable.replace("smoothed_w", "smoothed_weighted_")
        readable = readable.replace("raw_w", "raw_weighted_")

    title = " ".join(word.capitalize() for word in readable.split("_"))

    # post-capitalization touch-ups, applied in this fixed order
    touch_ups = ((" Ili", " ILI"), (" Cli", " CLI"), ("Dont", "Do Not"))
    for needle, replacement in touch_ups:
        title = title.replace(needle, replacement)
    return title
28
32
29
33
30
34
def guess_high_values_are (source : str , signal : str ) -> HighValuesAre :
@@ -55,10 +59,16 @@ def guess_high_values_are(source: str, signal: str) -> HighValuesAre:
55
59
def guess_format(source: str, signal: str) -> SignalFormat:
    """Guess how the values of a signal are formatted.

    Rules, checked in order:

    * ``fb-survey``, ``quidel`` and ``hospital-admissions`` report percentages.
    * ``safegraph`` signals ending in ``_prop`` or ``_prop_7dav`` are per 100k.
    * ``indicator-combination``, ``usa-facts`` and ``jhu-csse`` signals are
      per 100k when they end in ``_prop`` and raw counts when they end in
      ``_num``.
    * ``covid-act-now`` has two hard-coded signals: the positivity rate is a
      fraction and the total tests is a raw count.
    * Everything else falls back to ``SignalFormat.raw``.
    """
    if source in ("fb-survey", "quidel", "hospital-admissions"):
        return SignalFormat.percent

    if source == "safegraph":
        if signal.endswith(("_prop", "_prop_7dav")):
            return SignalFormat.per100k
    elif source in ("indicator-combination", "usa-facts", "jhu-csse"):
        if signal.endswith("_prop"):
            return SignalFormat.per100k
        if signal.endswith("_num"):
            return SignalFormat.raw_count
    elif source == "covid-act-now":
        if signal == "pcr_specimen_positivity_rate":
            return SignalFormat.fraction
        if signal == "pcr_specimen_total_tests":
            return SignalFormat.raw_count

    # no rule matched
    return SignalFormat.raw
63
73
64
74
@@ -67,11 +77,41 @@ def guess_category(source: str, signal: str) -> SignalCategory:
67
77
return SignalCategory .early
68
78
if source in ["fb-survey" , "safegraph" , "google-symptoms" ]:
69
79
return SignalCategory .public
70
- if source in ["quidel" , "hospital-admissions" , "indicator-combination" ]:
80
+ if source in ["quidel" , "hospital-admissions" , "indicator-combination" , "usa-facts" , "jhu-csse" , "hhs" , "chng" ]:
71
81
return SignalCategory .late
72
82
return SignalCategory .other
73
83
74
84
85
def guess_is_smoothed(signal: str) -> bool:
    """Return True if the signal name contains a smoothing marker.

    The markers looked for are the substrings ``smoothed_`` and ``7dav``.
    """
    return any(marker in signal for marker in ("smoothed_", "7dav"))
87
+
88
+
89
def guess_is_cumulative(signal: str) -> bool:
    """Return True if the signal name contains the substring ``cumulative_``."""
    return signal.find("cumulative_") != -1
91
+
92
+
93
def guess_is_weighted(source: str, signal: str) -> bool:
    """Guess whether a signal is the weighted/adjusted variant.

    * ``chng``: weighted when the signal starts with ``smoothed_adj_``.
    * ``fb-survey``: weighted when the signal starts with ``raw_w``, or with
      ``smoothed_w`` unless the ``w`` is merely the first letter of the
      indicator name (``wanted``, ``wearing``, ``work``, ``worried``).
    * any other source: never weighted.
    """
    if source == "chng":
        return signal.startswith("smoothed_adj_")
    if source != "fb-survey":
        return False

    if signal.startswith("raw_w"):
        return True
    if not signal.startswith("smoothed_w"):
        return False

    # "smoothed_wanted..." is the plain signal; its weighted counterpart is
    # "smoothed_wwanted...", so these leading words do not indicate weighting.
    remainder = signal[len("smoothed_"):]
    return not remainder.startswith(("wanted", "wearing", "work", "worried"))
105
+
106
+
107
def guess_has_stderr(source: str) -> bool:
    """Return True for the sources flagged as having a standard error.

    Only ``fb-survey`` and ``quidel`` are flagged.
    """
    return source == "fb-survey" or source == "quidel"
109
+
110
+
111
def guess_has_sample_size(source: str) -> bool:
    """Return True for the sources flagged as having a sample size.

    Only ``fb-survey`` and ``quidel`` are flagged.
    """
    sources_with_sample_size = ("fb-survey", "quidel")
    return source in sources_with_sample_size
113
+
114
+
75
115
@dataclass
76
116
class CovidcastMetaStats :
77
117
min : float
@@ -80,6 +120,112 @@ class CovidcastMetaStats:
80
120
max : float
81
121
82
122
123
# Maps a data source name to the set of signal names available for that source.
AllSignalsMap = Dict[str, Set[str]]
124
+
125
+
126
def guess_related_fb_survey_like(entry: "CovidcastMetaEntry", weighted_infix: str = "w") -> Set[str]:
    """Guess the raw/smoothed and weighted/unweighted siblings of a signal.

    The entry's signal is first normalized to its plain smoothed (unweighted)
    name; the four variants are then produced by swapping the ``smoothed_``
    prefix for each combination of raw/smoothed and plain/weighted.

    Args:
        entry: object exposing ``signal``, ``is_weighted`` and ``is_smoothed``.
        weighted_infix: text that follows ``smoothed_`` / ``raw_`` in the
            weighted variant of a signal name.

    Returns:
        The four candidate names (the entry's own name included); candidates
        are not checked for existence here.
    """
    smoothed = "smoothed_"
    raw = "raw_"

    # normalize to the plain smoothed name
    if entry.is_weighted:
        base = entry.signal.replace(smoothed + weighted_infix, smoothed)
        base = base.replace(raw + weighted_infix, smoothed)
    elif entry.is_smoothed:
        base = entry.signal
    else:
        base = entry.signal.replace(raw, smoothed)

    variant_prefixes = (
        smoothed,                    # plain smoothed
        smoothed + weighted_infix,   # weighted smoothed
        raw,                         # plain raw
        raw + weighted_infix,        # weighted raw
    )
    return {base.replace(smoothed, prefix) for prefix in variant_prefixes}
148
+
149
+
150
def guess_related_cases_death_like(entry: "CovidcastMetaEntry") -> Set[str]:
    """Guess the sibling signals of a cases/deaths style signal.

    The text before the first underscore of the signal is taken as the base
    (e.g. ``confirmed`` or ``deaths``).  Candidates are every combination of
    incidence/cumulative, plain/``7dav`` and ``num``/``prop``, following the
    pattern ``<base>_[7dav_]<incidence|cumulative>_<num|prop>``.

    Args:
        entry: object exposing ``signal`` and ``is_weighted``.

    Returns:
        The eight candidate names (possibly including the entry's own name),
        or an empty set for weighted entries, which are not handled.
        Candidates are not checked for existence here.
    """
    if entry.is_weighted:
        return set()  # cannot handle

    # partition() never raises: a signal without any "_" is its own base,
    # whereas str.index("_") would abort with ValueError.
    base_prefix = entry.signal.partition("_")[0]

    # Generated names must not contain whitespace, otherwise they can never
    # match a real signal name.
    return {
        f"{base_prefix}_{smoothing}{kind}_{suffix}"
        for suffix in ("num", "prop")
        for smoothing in ("", "7dav_")
        for kind in ("incidence", "cumulative")
    }
171
+
172
+
173
def guess_related_safegraph(entry: "CovidcastMetaEntry") -> Set[str]:
    """Guess the sibling signals of a safegraph signal.

    ``median_home_dwell_time*`` signals only relate to the plain and ``_7dav``
    dwell-time signals.  For every other signal the ``_7dav``, ``_prop`` and
    ``_num`` markers are removed to obtain a base name, and the candidates are
    ``<base>_<num|prop>`` plus their ``_7dav`` counterparts.

    Args:
        entry: object exposing ``signal`` and ``is_weighted``.

    Returns:
        The candidate names (possibly including the entry's own name), or an
        empty set for weighted entries, which are not handled.  Candidates are
        not checked for existence here.
    """
    if entry.is_weighted:
        return set()  # cannot handle

    if entry.signal.startswith("median_home_dwell_time"):
        return {"median_home_dwell_time", "median_home_dwell_time_7dav"}

    base_prefix = entry.signal
    for marker in ("_7dav", "_prop", "_num"):
        base_prefix = base_prefix.replace(marker, "")

    # Generated names must not contain whitespace, otherwise they can never
    # match a real signal name.
    related: Set[str] = set()
    for suffix in ("num", "prop"):
        plain = f"{base_prefix}_{suffix}"
        related.add(plain)
        related.add(f"{plain}_7dav")
    return related
193
+
194
+
195
def guess_related_generic(entry: "CovidcastMetaEntry") -> Set[str]:
    """Fallback guess: pair a raw signal with its smoothed twin and vice versa.

    Weighted and cumulative entries yield no guess.  Otherwise a single
    candidate is returned, obtained by swapping ``smoothed_`` for ``raw_`` when
    the entry is smoothed, or ``raw_`` for ``smoothed_`` when it is not.  If
    the signal does not contain the text being swapped, the candidate equals
    the signal itself.
    """
    if entry.is_weighted or entry.is_cumulative:
        return set()  # don't know

    old, new = ("smoothed_", "raw_") if entry.is_smoothed else ("raw_", "smoothed_")
    return {entry.signal.replace(old, new)}
204
+
205
+
206
def guess_related_signals(entry: "CovidcastMetaEntry", all_signals: AllSignalsMap) -> List[str]:
    """Return the sorted names of existing signals related to ``entry``.

    A source-specific guesser proposes candidate names; the entry's own name
    is dropped and only names listed for the same source in ``all_signals``
    are kept.  ``nmf_*`` signals of ``indicator-combination`` never have
    related signals.

    Args:
        entry: the metadata entry to find siblings for.
        all_signals: map from source name to the signal names of that source.

    Returns:
        Alphabetically sorted list of related signal names (may be empty).
    """
    source = entry.source
    if source == "indicator-combination" and entry.signal.startswith("nmf_"):
        return []

    # pick the guesser matching the source's naming scheme
    if source == "fb-survey":
        candidates = guess_related_fb_survey_like(entry, "w")
    elif source in ("chng", "doctor-visits", "hospital-admissions"):
        candidates = guess_related_fb_survey_like(entry, "adj_")
    elif source == "safegraph":
        candidates = guess_related_safegraph(entry)
    elif source in ("indicator-combination", "usa-facts", "jhu-csse"):
        candidates = guess_related_cases_death_like(entry)
    else:
        candidates = guess_related_generic(entry)

    # a signal is not related to itself, and only real signals count
    known_for_source = all_signals.get(source, set())
    others = candidates.difference((entry.signal,))
    return sorted(others.intersection(known_for_source))
227
+
228
+
83
229
@dataclass
84
230
class CovidcastMetaEntry :
85
231
source : str
@@ -93,13 +239,28 @@ class CovidcastMetaEntry:
93
239
high_values_are : HighValuesAre = field (init = False )
94
240
format : SignalFormat = field (init = False )
95
241
category : SignalCategory = field (init = False )
242
+ is_smoothed : bool = field (init = False )
243
+ is_weighted : bool = field (init = False )
244
+ is_cumulative : bool = field (init = False )
245
+ has_stderr : bool = field (init = False )
246
+ has_sample_size : bool = field (init = False )
247
+
248
+ related_signals : List [str ] = field (init = False )
249
+
250
+ all_signals : InitVar [AllSignalsMap ]
96
251
97
def __post_init__(self, all_signals: AllSignalsMap):
    """Populate the derived fields from ``source`` and ``signal``.

    ``all_signals`` is the init-only map handed to ``guess_related_signals``.
    """
    source, signal = self.source, self.signal

    # classification of the signal
    self.high_values_are = guess_high_values_are(source, signal)
    self.format = guess_format(source, signal)
    self.category = guess_category(source, signal)

    # boolean flags; these are set before related_signals and name are
    # derived because both of those are computed from the flags
    self.is_smoothed = guess_is_smoothed(signal)
    self.is_weighted = guess_is_weighted(source, signal)
    self.is_cumulative = guess_is_cumulative(signal)
    self.has_stderr = guess_has_stderr(source)
    self.has_sample_size = guess_has_sample_size(source)

    self.related_signals = guess_related_signals(self, all_signals)
    self.name = guess_name(source, signal, self.is_weighted)
103
264
104
265
def intergrate (self , row : Dict [str , Any ]):
105
266
if row ["min_time" ] < self .min_time :
0 commit comments