From 1d61c6490e8b68277ff7a9ea9a008f258b4d1fe8 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Wed, 29 Sep 2021 11:37:29 -0400
Subject: [PATCH 1/4] pull actuals without evalcast, a la evaluate_chu

---
 Report/create_reports.R |   4 +-
 Report/score.R          | 111 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 112 insertions(+), 3 deletions(-)

diff --git a/Report/create_reports.R b/Report/create_reports.R
index 5ae5f37..9e8bb39 100644
--- a/Report/create_reports.R
+++ b/Report/create_reports.R
@@ -113,7 +113,9 @@ rm(predictions_cards)
 gc()
 
 print("Evaluating state forecasts")
-state_scores <- evaluate_covid_predictions(state_predictions,
+state_scores <- evaluate_covidcast(
+  state_predictions, 
+  signals, 
   err_measures,
   geo_type = "state"
 )
diff --git a/Report/score.R b/Report/score.R
index 8b057fe..3fea42b 100644
--- a/Report/score.R
+++ b/Report/score.R
@@ -46,6 +46,7 @@ save_score_cards <- function(score_card, geo_type = c("state", "nation"),
   )
 }
 
+# Fetch national truth data from CovidHubUtils.
 evaluate_chu <- function(predictions, signals, err_measures) {
   allowed_signals <- c(
     "confirmed_incidence_num",
@@ -69,7 +70,7 @@ evaluate_chu <- function(predictions, signals, err_measures) {
     "deaths_incidence_num" = "JHU",
     "confirmed_admissions_covid_1d" = "HealthData"
   )
-  scores <- c()
+  scores <- list()
   for (signal_name in signals) {
     preds_signal <- predictions %>%
       filter(signal == signal_name)
@@ -87,6 +88,8 @@ evaluate_chu <- function(predictions, signals, err_measures) {
         geo_type,
         abbreviation
       ))
+    ## select equivalent to
+    # select(target_end_date, actual, geo_value, full_location_name)
     signal_scores <- evaluate_predictions(preds_signal,
       truth_data = chu_truth,
       err_measures,
@@ -97,7 +100,111 @@ evaluate_chu <- function(predictions, signals, err_measures) {
         "forecaster"
       )
     )
-    scores <- rbind(scores, signal_scores)
+    scores[[signal_name]] <- signal_scores
   }
+  return(bind_rows(scores))
+}
+
+# Fetch truth data from COVIDcast. This function bypasses some of the slow parts
+# of the `evalcast` pipeline by pulling all data from COVIDcast together.
+evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) {
+  allowed_signals <- c(
+    "confirmed_incidence_num",
+    "deaths_incidence_num",
+    "confirmed_admissions_covid_1d"
+  )
+  assert_that(all(signals %in% allowed_signals),
+              msg = paste(
+                "Signal not allowed:",
+                setdiff(signals, allowed_signals)
+              )
+  )
+  
+  source_map <- list(
+    "confirmed_incidence_num" = "jhu-csse",
+    "deaths_incidence_num" = "jhu-csse",
+    "confirmed_admissions_covid_1d" = "hhs"
+  )
+  scores <- list()
+  for (signal_name in signals) {
+    preds_signal <- predictions %>%
+      filter(signal == signal_name)
+    source <- source_map[[signal_name]]
+    covidcast_truth <- get_covidcast_period_actuals(preds_signal)
+    signal_scores <- evaluate_predictions(preds_signal,
+                                          truth_data = covidcast_truth,
+                                          err_measures,
+                                          grp_vars = c(
+                                            "target_end_date",
+                                            "geo_value",
+                                            "ahead",
+                                            "forecaster"
+                                          )
+    )
+    scores[[signal_name]] <- signal_scores
+  }
+  
+  scores <- bind_rows(scores) %>%
+    arrange(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period) %>% 
+    select(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period, everything)
   return(scores)
 }
+
+
+get_covidcast_period_actuals <- function(response) {
+  # Get start/end dates of each period we want to sum truth values over.
+  target_periods <- response %>%
+    select(.data$forecast_date, .data$incidence_period, .data$ahead) %>% 
+    distinct() %>% 
+    purrr::pmap_dfr(get_target_period) %>% 
+    distinct()
+
+  # Compute the actual values that the forecaster is trying to
+  # predict. In particular,
+  # - get most recent data available from covidcast for these target periods
+  # - sum up the response over the target incidence period
+  target_periods <- target_periods %>%
+    mutate(available = .data$end <= Sys.Date()) %>%
+    filter(.data$available) %>% 
+    select(-.data$available)
+  
+  covidcast_truth <- covidcast::covidcast_signal(
+    source,
+    signal_name,
+    geo_type = geo_type,
+    start_day = as.Date(min(target_periods$start)),
+    end_day = as.Date(max(target_periods$end))
+  ) %>% 
+    select(data_source, signal, geo_value, time_value, value)
+  
+  # Expand out each period by day so easier to join on.
+  target_periods <- target_periods %>% pmap_dfr(function(start_date, end_date) {
+    tibble(start=start_date,
+           target_end_date = end_date,
+           day = seq.Date(from=start_date, to=end_date, by = 1)
+    )
+  })
+  
+  period_truth <- full_join(covidcast_truth, target_periods, by=c("time_value"="day"))
+  
+  check_count <- period_truth %>%
+    group_by(.data$geo_value, .data$start, .data$target_end_date) %>%
+    summarize(num = n(), .groups="drop") %>% 
+    filter(num < 7)
+  
+  if (nrow(check_count) != 0) {
+    warning(paste0("Some or all data missing for the following target periods: ",
+                   paste(
+                     paste(period_truth$start, period_truth$target_end_date, sep="-"),
+                     collapse = ", "),
+                   ".")
+    )
+  }
+  
+  period_truth <- period_truth %>%
+    group_by(.data$geo_value, .data$target_end_date) %>%
+    summarize(actual = sum(.data$value), .groups="drop") %>% 
+    select(.data$target_end_date, .data$actual, .data$geo_value)
+  
+  return(period_truth)
+}

From 49ec2b42ff802c80594a968b588ccac761e4d01e Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Fri, 18 Feb 2022 18:38:54 -0500
Subject: [PATCH 2/4] styler

---
 Report/create_reports.R |  4 +-
 Report/score.R          | 81 +++++++++++++++++++++--------------------
 2 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/Report/create_reports.R b/Report/create_reports.R
index 0e01cc8..af8239f 100644
--- a/Report/create_reports.R
+++ b/Report/create_reports.R
@@ -133,8 +133,8 @@ for (signal_name in signals) {
 print("Evaluating state forecasts")
 geo_type <- "state"
 state_scores <- evaluate_covidcast(
-  state_predictions, 
-  signals, 
+  state_predictions,
+  signals,
   err_measures,
   geo_type = geo_type
 )
diff --git a/Report/score.R b/Report/score.R
index 3ce6e95..973e017 100644
--- a/Report/score.R
+++ b/Report/score.R
@@ -130,12 +130,12 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) {
     "confirmed_admissions_covid_1d"
   )
   assert_that(all(signals %in% allowed_signals),
-              msg = paste(
-                "Signal not allowed:",
-                setdiff(signals, allowed_signals)
-              )
+    msg = paste(
+      "Signal not allowed:",
+      setdiff(signals, allowed_signals)
+    )
   )
-  
+
   source_map <- list(
     "confirmed_incidence_num" = "jhu-csse",
     "deaths_incidence_num" = "jhu-csse",
@@ -148,20 +148,20 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) {
     source <- source_map[[signal_name]]
     covidcast_truth <- get_covidcast_period_actuals(preds_signal)
     signal_scores <- evaluate_predictions(preds_signal,
-                                          truth_data = covidcast_truth,
-                                          err_measures,
-                                          grp_vars = c(
-                                            "target_end_date",
-                                            "geo_value",
-                                            "ahead",
-                                            "forecaster"
-                                          )
+      truth_data = covidcast_truth,
+      err_measures,
+      grp_vars = c(
+        "target_end_date",
+        "geo_value",
+        "ahead",
+        "forecaster"
+      )
     )
     scores[[signal_name]] <- signal_scores
   }
-  
+
   scores <- bind_rows(scores) %>%
-    arrange(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period) %>% 
+    arrange(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period) %>%
     select(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period, everything)
   return(scores)
 }
@@ -170,9 +170,9 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) {
 get_covidcast_period_actuals <- function(response) {
   # Get start/end dates of each period we want to sum truth values over.
   target_periods <- response %>%
-    select(.data$forecast_date, .data$incidence_period, .data$ahead) %>% 
-    distinct() %>% 
-    purrr::pmap_dfr(get_target_period) %>% 
+    select(.data$forecast_date, .data$incidence_period, .data$ahead) %>%
+    distinct() %>%
+    purrr::pmap_dfr(get_target_period) %>%
     distinct()
 
   # Compute the actual values that the forecaster is trying to
@@ -181,46 +181,49 @@ get_covidcast_period_actuals <- function(response) {
   # - sum up the response over the target incidence period
   target_periods <- target_periods %>%
     mutate(available = .data$end <= Sys.Date()) %>%
-    filter(.data$available) %>% 
+    filter(.data$available) %>%
     select(-.data$available)
-  
+
   covidcast_truth <- covidcast::covidcast_signal(
     source,
     signal_name,
     geo_type = geo_type,
     start_day = as.Date(min(target_periods$start)),
     end_day = as.Date(max(target_periods$end))
-  ) %>% 
+  ) %>%
     select(data_source, signal, geo_value, time_value, value)
-  
+
   # Expand out each period by day so easier to join on.
   target_periods <- target_periods %>% pmap_dfr(function(start_date, end_date) {
-    tibble(start=start_date,
-           target_end_date = end_date,
-           day = seq.Date(from=start_date, to=end_date, by = 1)
+    tibble(
+      start = start_date,
+      target_end_date = end_date,
+      day = seq.Date(from = start_date, to = end_date, by = 1)
     )
   })
-  
-  period_truth <- full_join(covidcast_truth, target_periods, by=c("time_value"="day"))
-  
+
+  period_truth <- full_join(covidcast_truth, target_periods, by = c("time_value" = "day"))
+
   check_count <- period_truth %>%
     group_by(.data$geo_value, .data$start, .data$target_end_date) %>%
-    summarize(num = n(), .groups="drop") %>% 
+    summarize(num = n(), .groups = "drop") %>%
     filter(num < 7)
-  
+
   if (nrow(check_count) != 0) {
-    warning(paste0("Some or all data missing for the following target periods: ",
-                   paste(
-                     paste(period_truth$start, period_truth$target_end_date, sep="-"),
-                     collapse = ", "),
-                   ".")
-    )
+    warning(paste0(
+      "Some or all data missing for the following target periods: ",
+      paste(
+        paste(period_truth$start, period_truth$target_end_date, sep = "-"),
+        collapse = ", "
+      ),
+      "."
+    ))
   }
-  
+
   period_truth <- period_truth %>%
     group_by(.data$geo_value, .data$target_end_date) %>%
-    summarize(actual = sum(.data$value), .groups="drop") %>% 
+    summarize(actual = sum(.data$value), .groups = "drop") %>%
     select(.data$target_end_date, .data$actual, .data$geo_value)
-  
+
   return(period_truth)
 }

From ca1dd260a0530e98645257c0e118a7e83dfc667e Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Thu, 14 Apr 2022 12:51:53 -0400
Subject: [PATCH 3/4] pass meta args to truth data-fetcher

---
 Report/score.R | 43 +++++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/Report/score.R b/Report/score.R
index 435a27e..ea4b226 100644
--- a/Report/score.R
+++ b/Report/score.R
@@ -135,7 +135,7 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) {
       setdiff(signals, allowed_signals)
     )
   )
-
+  
   source_map <- list(
     "confirmed_incidence_num" = "jhu-csse",
     "deaths_incidence_num" = "jhu-csse",
@@ -146,7 +146,7 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) {
     preds_signal <- predictions %>%
       filter(signal == signal_name)
     source <- source_map[[signal_name]]
-    covidcast_truth <- get_covidcast_period_actuals(preds_signal)
+    covidcast_truth <- get_covidcast_period_actuals(preds_signal, source, signal_name, geo_type)
     signal_scores <- evaluate_predictions(preds_signal,
       truth_data = covidcast_truth,
       err_measures,
@@ -159,15 +159,15 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) {
     )
     scores[[signal_name]] <- signal_scores
   }
-
+  
   scores <- bind_rows(scores) %>%
     arrange(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period) %>%
-    select(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period, everything)
+    select(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period, everything())
   return(scores)
 }
 
 
-get_covidcast_period_actuals <- function(response) {
+get_covidcast_period_actuals <- function(response, source, signal_name, geo_type) {
   # Get start/end dates of each period we want to sum truth values over.
   target_periods <- response %>%
     select(.data$forecast_date, .data$incidence_period, .data$ahead) %>%
@@ -194,7 +194,7 @@ get_covidcast_period_actuals <- function(response) {
     select(data_source, signal, geo_value, time_value, value)
 
   # Expand out each period by day so easier to join on.
-  target_periods <- target_periods %>% pmap_dfr(function(start_date, end_date) {
+  target_periods <- target_periods %>% purrr::pmap_dfr(function(start_date, end_date) {
     tibble(
       start = start_date,
       target_end_date = end_date,
@@ -204,20 +204,23 @@ get_covidcast_period_actuals <- function(response) {
 
   period_truth <- full_join(covidcast_truth, target_periods, by = c("time_value" = "day"))
 
-  check_count <- period_truth %>%
-    group_by(.data$geo_value, .data$start, .data$target_end_date) %>%
-    summarize(num = n(), .groups = "drop") %>%
-    filter(num < 7)
-
-  if (nrow(check_count) != 0) {
-    warning(paste0(
-      "Some or all data missing for the following target periods: ",
-      paste(
-        paste(period_truth$start, period_truth$target_end_date, sep = "-"),
-        collapse = ", "
-      ),
-      "."
-    ))
+  if (signal_name != "confirmed_admissions_covid_1d") {
+    # For deaths and cases, expect each truth data period to cover a week
+    check_count <- period_truth %>%
+      group_by(.data$geo_value, .data$start, .data$target_end_date) %>%
+      summarize(num = n(), .groups = "drop") %>%
+      filter(num < 7)
+    
+    if (nrow(check_count) != 0) {
+      warning(paste0(
+        "Some or all data missing for the following target periods: ",
+        paste(
+          paste(period_truth$start, period_truth$target_end_date, sep = "-"),
+          collapse = ", "
+        ),
+        "."
+      ))
+    }
   }
 
   period_truth <- period_truth %>%

From 6c6857be1f899f2777329349ca0d466222d693e6 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Thu, 14 Apr 2022 13:00:39 -0400
Subject: [PATCH 4/4] styler

---
 Report/score.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Report/score.R b/Report/score.R
index ea4b226..942ff1f 100644
--- a/Report/score.R
+++ b/Report/score.R
@@ -135,7 +135,7 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) {
       setdiff(signals, allowed_signals)
     )
   )
-  
+
   source_map <- list(
     "confirmed_incidence_num" = "jhu-csse",
     "deaths_incidence_num" = "jhu-csse",
@@ -159,7 +159,7 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) {
     )
     scores[[signal_name]] <- signal_scores
   }
-  
+
   scores <- bind_rows(scores) %>%
     arrange(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period) %>%
     select(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period, everything())
@@ -210,7 +210,7 @@ get_covidcast_period_actuals <- function(response, source, signal_name, geo_type
       group_by(.data$geo_value, .data$start, .data$target_end_date) %>%
       summarize(num = n(), .groups = "drop") %>%
       filter(num < 7)
-    
+
     if (nrow(check_count) != 0) {
       warning(paste0(
         "Some or all data missing for the following target periods: ",