Skip to content

Commit 2df8d63

Browse files
authored
Merge pull request #538 from cmu-delphi/sgratzl/meta_3
add related signals to /covidcast/meta
2 parents 77eb17d + 641d0e5 commit 2df8d63

File tree

3 files changed

+183
-13
lines changed

3 files changed

+183
-13
lines changed

src/server/endpoints/covidcast.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Optional, Union, Tuple, Dict, Any
1+
from typing import List, Optional, Union, Tuple, Dict, Any, Set
22
from itertools import groupby
33
from datetime import date, datetime
44
from flask import Blueprint, request
@@ -32,7 +32,7 @@
3232
require_any,
3333
)
3434
from .._pandas import as_pandas
35-
from .covidcast_utils import compute_trend, compute_trends, compute_correlations, compute_trend_value, CovidcastMetaEntry
35+
from .covidcast_utils import compute_trend, compute_trends, compute_correlations, compute_trend_value, CovidcastMetaEntry, AllSignalsMap
3636
from ..utils import shift_time_value, date_to_time_value, time_value_to_iso
3737

3838
# first argument is the endpoint name
@@ -467,13 +467,22 @@ def handle_meta():
467467

468468
data = loads(row["epidata"]) if row and row["epidata"] else []
469469

470+
all_signals: AllSignalsMap = {}
471+
for row in data:
472+
if row["time_type"] != "day":
473+
continue
474+
entry: Set[str] = all_signals.setdefault(row["data_source"], set())
475+
entry.add(row["signal"])
476+
470477
out: Dict[str, CovidcastMetaEntry] = {}
471478
for row in data:
472479
if row["time_type"] != "day":
473480
continue
474481
if signal and all((not s.matches(row["data_source"], row["signal"]) for s in signal)):
475482
continue
476-
entry = out.setdefault(f"{row['data_source']}:{row['signal']}", CovidcastMetaEntry(row["data_source"], row["signal"], row["min_time"], row["max_time"], row["max_issue"], {}))
483+
entry = out.setdefault(
484+
f"{row['data_source']}:{row['signal']}", CovidcastMetaEntry(row["data_source"], row["signal"], row["min_time"], row["max_time"], row["max_issue"], {}, all_signals=all_signals)
485+
)
477486
entry.intergrate(row)
478487

479488
return jsonify([r.asdict() for r in out.values()])
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
from .trend import compute_trend, compute_trend_value, compute_trends
22
from .correlation import compute_correlations
3-
from .meta import CovidcastMetaEntry
3+
from .meta import CovidcastMetaEntry, AllSignalsMap

src/server/endpoints/covidcast_utils/meta.py

Lines changed: 170 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
from dataclasses import dataclass, asdict, field
2-
from typing import Dict, Any
1+
from dataclasses import InitVar, dataclass, asdict, field
2+
from typing import Dict, Any, List, Set
33
from enum import Enum
44

55

@@ -13,6 +13,7 @@ class SignalFormat(str, Enum):
1313
per100k = "per100k"
1414
percent = "percent"
1515
fraction = "fraction"
16+
raw_count = "raw_count"
1617
raw = "raw"
1718

1819

@@ -23,8 +24,11 @@ class SignalCategory(str, Enum):
2324
other = "other"
2425

2526

26-
def guess_name(source: str, signal: str) -> str:
27-
return f"{source.upper()}: {' '.join((s.capitalize() for s in signal.split('_')))}"
27+
def guess_name(source: str, signal: str, is_weighted: bool) -> str:
    """Build a human-readable display name from a signal identifier.

    For weighted ``fb-survey`` signals the compact ``smoothed_w`` / ``raw_w``
    prefixes are first expanded to ``smoothed_weighted_`` / ``raw_weighted_``.
    The identifier is then split on underscores, each word is capitalized,
    the words are joined with spaces, and a few fixed textual substitutions
    are applied to the joined result.

    Args:
        source: data source identifier of the signal.
        signal: raw signal identifier (underscore separated).
        is_weighted: whether the signal was classified as weighted.

    Returns:
        The derived display name.
    """
    expanded = signal
    if source == "fb-survey" and is_weighted:
        expanded = expanded.replace("smoothed_w", "smoothed_weighted_")
        expanded = expanded.replace("raw_w", "raw_weighted_")

    words = [word.capitalize() for word in expanded.split("_")]
    name = " ".join(words)
    # fixed touch-ups, applied in this order on the joined name
    for needle, replacement in ((" Ili", " ILI"), (" Cli", " CLI"), ("Dont", "Do Not")):
        name = name.replace(needle, replacement)
    return name
2832

2933

3034
def guess_high_values_are(source: str, signal: str) -> HighValuesAre:
@@ -55,10 +59,16 @@ def guess_high_values_are(source: str, signal: str) -> HighValuesAre:
5559
def guess_format(source: str, signal: str) -> SignalFormat:
5660
if source in ["fb-survey", "quidel", "hospital-admissions"]:
5761
return SignalFormat.percent
58-
if source == "safegraph" and signal.endswith("_prop"):
62+
if source == "safegraph" and (signal.endswith("_prop") or signal.endswith("_prop_7dav")):
5963
return SignalFormat.per100k
60-
if source == "indicator-combination" and signal.endswith("_prop"):
64+
if source in ["indicator-combination", "usa-facts", "jhu-csse"] and signal.endswith("_prop"):
6165
return SignalFormat.per100k
66+
if source in ["indicator-combination", "usa-facts", "jhu-csse"] and signal.endswith("_num"):
67+
return SignalFormat.raw_count
68+
if source == "covid-act-now" and signal == "pcr_specimen_positivity_rate":
69+
return SignalFormat.fraction
70+
if source == "covid-act-now" and signal == "pcr_specimen_total_tests":
71+
return SignalFormat.raw_count
6272
return SignalFormat.raw
6373

6474

@@ -67,11 +77,41 @@ def guess_category(source: str, signal: str) -> SignalCategory:
6777
return SignalCategory.early
6878
if source in ["fb-survey", "safegraph", "google-symptoms"]:
6979
return SignalCategory.public
70-
if source in ["quidel", "hospital-admissions", "indicator-combination"]:
80+
if source in ["quidel", "hospital-admissions", "indicator-combination", "usa-facts", "jhu-csse", "hhs", "chng"]:
7181
return SignalCategory.late
7282
return SignalCategory.other
7383

7484

85+
def guess_is_smoothed(signal: str) -> bool:
    """Return True when the signal name contains ``smoothed_`` or ``7dav``."""
    return any(marker in signal for marker in ("smoothed_", "7dav"))
87+
88+
89+
def guess_is_cumulative(signal: str) -> bool:
    """Return True when the signal name contains the ``cumulative_`` marker."""
    return signal.find("cumulative_") >= 0
91+
92+
93+
def guess_is_weighted(source: str, signal: str) -> bool:
    """Guess from its name whether a signal is a weighted variant.

    ``fb-survey`` marks weighted signals by inserting a ``w`` right after the
    ``smoothed_`` / ``raw_`` prefix.  Some unweighted indicator names start
    with a "w" themselves (e.g. ``smoothed_wanted...``, whose weighted form is
    ``smoothed_wwanted...``), so those are excluded.  The exclusion is applied
    to both the ``smoothed_`` and the ``raw_`` prefix so that the two variants
    of the same indicator are classified consistently.

    For ``chng`` a signal starting with ``smoothed_adj_`` counts as weighted.

    Args:
        source: data source identifier.
        signal: signal identifier.

    Returns:
        True if the signal is guessed to be weighted, False otherwise.
    """
    if source == "fb-survey":
        for prefix in ("smoothed_", "raw_"):
            if signal.startswith(prefix + "w"):
                rest = signal[len(prefix):]
                # it is smoothed_wanted but the weighted one is smoothed_wwanted
                return not rest.startswith(("wanted", "wearing", "work", "worried"))
        return False
    return source == "chng" and signal.startswith("smoothed_adj_")
105+
106+
107+
def guess_has_stderr(source: str) -> bool:
    """Return True for the sources listed as providing a standard error.

    Only ``fb-survey`` and ``quidel`` are listed.
    """
    return source == "fb-survey" or source == "quidel"
109+
110+
111+
def guess_has_sample_size(source: str) -> bool:
    """Return True for the sources listed as providing a sample size.

    Only ``fb-survey`` and ``quidel`` are listed.
    """
    sources_with_sample_size = ("fb-survey", "quidel")
    return source in sources_with_sample_size
113+
114+
75115
@dataclass
76116
class CovidcastMetaStats:
77117
min: float
@@ -80,6 +120,112 @@ class CovidcastMetaStats:
80120
max: float
81121

82122

123+
# Type alias for a dict from a string key to a set of strings.
# guess_related_signals looks it up by ``entry.source`` and intersects the
# resulting set with its guessed signal names.
AllSignalsMap = Dict[str, Set[str]]
124+
125+
126+
def guess_related_fb_survey_like(entry: "CovidcastMetaEntry", weighted_infix: str = "w") -> Set[str]:
    """Guess the smoothed/raw and weighted/unweighted siblings of a signal.

    The signal name is first normalized to its plain ``smoothed_`` form; from
    that form the four variants (smoothed, weighted smoothed, raw, weighted
    raw) are derived by swapping the prefix.

    Args:
        entry: object exposing ``signal``, ``is_weighted`` and ``is_smoothed``.
        weighted_infix: text inserted after ``smoothed_`` / ``raw_`` to mark
            the weighted variant.

    Returns:
        The set of the four candidate names (it includes the entry's own
        signal name whenever that name matches one of the variants).
    """
    signal = entry.signal
    if entry.is_weighted:
        # strip the weighting marker; a weighted raw name also becomes smoothed
        base = signal.replace(f"smoothed_{weighted_infix}", "smoothed_")
        base = base.replace(f"raw_{weighted_infix}", "smoothed_")
    elif entry.is_smoothed:
        base = signal
    else:
        base = signal.replace("raw_", "smoothed_")

    variant_prefixes = (
        "smoothed_",
        f"smoothed_{weighted_infix}",
        "raw_",
        f"raw_{weighted_infix}",
    )
    return {base.replace("smoothed_", prefix) for prefix in variant_prefixes}
148+
149+
150+
def guess_related_cases_death_like(entry: "CovidcastMetaEntry") -> Set[str]:
    """Guess the incidence/cumulative sibling names of a cases/deaths signal.

    The text before the first underscore of the signal name is used as the
    base prefix.  For each of the ``num`` and ``prop`` suffixes, the
    incidence, cumulative, 7dav incidence and 7dav cumulative names are
    generated.

    Args:
        entry: object exposing ``signal`` and ``is_weighted``.

    Returns:
        The set of candidate signal names (it may include the entry's own
        signal name), or an empty set for weighted signals.
    """
    if entry.is_weighted:
        return set()  # cannot handle

    # partition() never raises: a name without "_" is used as the prefix as-is
    # (str.index would raise ValueError here)
    base_prefix = entry.signal.partition("_")[0]

    kinds = ("incidence", "cumulative", "7dav_incidence", "7dav_cumulative")
    return {
        f"{base_prefix}_{kind}_{suffix}"
        for suffix in ("num", "prop")
        for kind in kinds
    }
171+
172+
173+
def guess_related_safegraph(entry: "CovidcastMetaEntry") -> Set[str]:
    """Guess the num/prop and plain/7dav sibling names of a safegraph signal.

    Signals starting with ``median_home_dwell_time`` only have a plain and a
    ``_7dav`` variant.  For every other signal the ``_7dav``, ``_prop`` and
    ``_num`` markers are removed to obtain a base name, from which the
    ``_num`` / ``_prop`` variants and their ``_7dav`` counterparts are built.

    Args:
        entry: object exposing ``signal`` and ``is_weighted``.

    Returns:
        The set of candidate signal names (it may include the entry's own
        signal name), or an empty set for weighted signals.
    """
    if entry.is_weighted:
        return set()  # cannot handle

    if entry.signal.startswith("median_home_dwell_time"):
        return {"median_home_dwell_time", "median_home_dwell_time_7dav"}

    base_prefix = entry.signal
    for marker in ("_7dav", "_prop", "_num"):
        base_prefix = base_prefix.replace(marker, "")

    # iterate over the suffixes directly instead of a loop variable named
    # `format`, which shadowed the builtin
    return {
        f"{base_prefix}_{suffix}{smoothing}"
        for suffix in ("num", "prop")
        for smoothing in ("", "_7dav")
    }
193+
194+
195+
def guess_related_generic(entry: "CovidcastMetaEntry") -> Set[str]:
    """Guess the raw/smoothed counterpart of a signal from an arbitrary source.

    Args:
        entry: object exposing ``signal``, ``is_weighted``, ``is_cumulative``
            and ``is_smoothed``.

    Returns:
        A one-element set with the signal name after swapping ``smoothed_``
        for ``raw_`` (smoothed signals) or ``raw_`` for ``smoothed_``
        (all others); an empty set for weighted or cumulative signals.
    """
    if entry.is_weighted or entry.is_cumulative:
        return set()  # don't know

    if entry.is_smoothed:
        old, new = "smoothed_", "raw_"
    else:
        old, new = "raw_", "smoothed_"
    return {entry.signal.replace(old, new)}
204+
205+
206+
def guess_related_signals(entry: "CovidcastMetaEntry", all_signals: AllSignalsMap) -> List[str]:
    """Return the sorted names of existing signals related to ``entry``.

    A source-specific guesser proposes candidate names; the entry's own
    signal is dropped and only candidates present in ``all_signals`` for the
    same source are kept.

    Args:
        entry: the meta entry to find related signals for.
        all_signals: mapping looked up by ``entry.source`` to get the signal
            names that exist for that source.

    Returns:
        Sorted list of related signal names; always empty for
        ``indicator-combination`` signals starting with ``nmf_``.
    """
    source = entry.source
    if source == "indicator-combination" and entry.signal.startswith("nmf_"):
        return []

    if source == "fb-survey":
        candidates = guess_related_fb_survey_like(entry, "w")
    elif source in ("chng", "doctor-visits", "hospital-admissions"):
        candidates = guess_related_fb_survey_like(entry, "adj_")
    elif source == "safegraph":
        candidates = guess_related_safegraph(entry)
    elif source in ("indicator-combination", "usa-facts", "jhu-csse"):
        candidates = guess_related_cases_death_like(entry)
    else:
        candidates = guess_related_generic(entry)

    # keep only valid signals of the same source, excluding the entry itself
    known = all_signals.get(source, set())
    return sorted(name for name in candidates if name != entry.signal and name in known)
227+
228+
83229
@dataclass
84230
class CovidcastMetaEntry:
85231
source: str
@@ -93,13 +239,28 @@ class CovidcastMetaEntry:
93239
high_values_are: HighValuesAre = field(init=False)
94240
format: SignalFormat = field(init=False)
95241
category: SignalCategory = field(init=False)
242+
is_smoothed: bool = field(init=False)
243+
is_weighted: bool = field(init=False)
244+
is_cumulative: bool = field(init=False)
245+
has_stderr: bool = field(init=False)
246+
has_sample_size: bool = field(init=False)
247+
248+
related_signals: List[str] = field(init=False)
249+
250+
all_signals: InitVar[AllSignalsMap]
96251

97-
def __post_init__(self):
252+
def __post_init__(self, all_signals: AllSignalsMap):
98253
# derive fields
99-
self.name = guess_name(self.source, self.signal)
100254
self.high_values_are = guess_high_values_are(self.source, self.signal)
101255
self.format = guess_format(self.source, self.signal)
102256
self.category = guess_category(self.source, self.signal)
257+
self.is_smoothed = guess_is_smoothed(self.signal)
258+
self.is_weighted = guess_is_weighted(self.source, self.signal)
259+
self.is_cumulative = guess_is_cumulative(self.signal)
260+
self.has_stderr = guess_has_stderr(self.source)
261+
self.has_sample_size = guess_has_sample_size(self.source)
262+
self.related_signals = guess_related_signals(self, all_signals)
263+
self.name = guess_name(self.source, self.signal, self.is_weighted)
103264

104265
def intergrate(self, row: Dict[str, Any]):
105266
if row["min_time"] < self.min_time:

0 commit comments

Comments
 (0)