From 1d61c6490e8b68277ff7a9ea9a008f258b4d1fe8 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 29 Sep 2021 11:37:29 -0400 Subject: [PATCH 1/4] pull actuals without evalcast, a la evaluate_chu --- Report/create_reports.R | 4 +- Report/score.R | 111 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 112 insertions(+), 3 deletions(-) diff --git a/Report/create_reports.R b/Report/create_reports.R index 5ae5f37..9e8bb39 100644 --- a/Report/create_reports.R +++ b/Report/create_reports.R @@ -113,7 +113,9 @@ rm(predictions_cards) gc() print("Evaluating state forecasts") -state_scores <- evaluate_covid_predictions(state_predictions, +state_scores <- evaluate_covidcast( + state_predictions, + signals, err_measures, geo_type = "state" ) diff --git a/Report/score.R b/Report/score.R index 8b057fe..3fea42b 100644 --- a/Report/score.R +++ b/Report/score.R @@ -46,6 +46,7 @@ save_score_cards <- function(score_card, geo_type = c("state", "nation"), ) } +# Fetch national truth data from CovidHubUtils. evaluate_chu <- function(predictions, signals, err_measures) { allowed_signals <- c( "confirmed_incidence_num", @@ -69,7 +70,7 @@ evaluate_chu <- function(predictions, signals, err_measures) { "deaths_incidence_num" = "JHU", "confirmed_admissions_covid_1d" = "HealthData" ) - scores <- c() + scores <- list() for (signal_name in signals) { preds_signal <- predictions %>% filter(signal == signal_name) @@ -87,6 +88,8 @@ evaluate_chu <- function(predictions, signals, err_measures) { geo_type, abbreviation )) + ## select equivalent to + # select(target_end_date, actual, geo_value, full_location_name) signal_scores <- evaluate_predictions(preds_signal, truth_data = chu_truth, err_measures, @@ -97,7 +100,111 @@ evaluate_chu <- function(predictions, signals, err_measures) { "forecaster" ) ) - scores <- rbind(scores, signal_scores) + scores[[signal_name]] <- signal_scores } + return(bind_rows(scores)) +} + +# Fetch truth data from COVIDcast. This function bypasses some of the slow parts +# of the `evalcast` pipeline by pulling all data from COVIDcast together. +evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) { + allowed_signals <- c( + "confirmed_incidence_num", + "deaths_incidence_num", + "confirmed_admissions_covid_1d" + ) + assert_that(all(signals %in% allowed_signals), + msg = paste( + "Signal not allowed:", + setdiff(signals, allowed_signals) + ) + ) + + source_map <- list( + "confirmed_incidence_num" = "jhu-csse", + "deaths_incidence_num" = "jhu-csse", + "confirmed_admissions_covid_1d" = "hhs" + ) + scores <- list() + for (signal_name in signals) { + preds_signal <- predictions %>% + filter(signal == signal_name) + source <- source_map[[signal_name]] + covidcast_truth <- get_covidcast_period_actuals(preds_signal) + signal_scores <- evaluate_predictions(preds_signal, + truth_data = covidcast_truth, + err_measures, + grp_vars = c( + "target_end_date", + "geo_value", + "ahead", + "forecaster" + ) + ) + scores[[signal_name]] <- signal_scores + } + + scores <- bind_rows(scores) %>% + arrange(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period) %>% + select(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period, everything) return(scores) } + + +get_covidcast_period_actuals <- function(response) { + # Get start/end dates of each period we want to sum truth values over. + target_periods <- response %>% + select(.data$forecast_date, .data$incidence_period, .data$ahead) %>% + distinct() %>% + purrr::pmap_dfr(get_target_period) %>% + distinct() + + # Compute the actual values that the forecaster is trying to + # predict. In particular, + # - get most recent data available from covidcast for these target periods + # - sum up the response over the target incidence period + target_periods <- target_periods %>% + mutate(available = .data$end <= Sys.Date()) %>% + filter(.data$available) %>% + select(-.data$available) + + covidcast_truth <- covidcast::covidcast_signal( + source, + signal_name, + geo_type = geo_type, + start_day = as.Date(min(target_periods$start)), + end_day = as.Date(max(target_periods$end)) + ) %>% + select(data_source, signal, geo_value, time_value, value) + + # Expand out each period by day so easier to join on. + target_periods <- target_periods %>% pmap_dfr(function(start_date, end_date) { + tibble(start=start_date, + target_end_date = end_date, + day = seq.Date(from=start_date, to=end_date, by = 1) + ) + }) + + period_truth <- full_join(covidcast_truth, target_periods, by=c("time_value"="day")) + + check_count <- period_truth %>% + group_by(.data$geo_value, .data$start, .data$target_end_date) %>% + summarize(num = n(), .groups="drop") %>% + filter(num < 7) + + if (nrow(check_count) != 0) { + warning(paste0("Some or all data missing for the following target periods: ", + paste( + paste(period_truth$start, period_truth$target_end_date, sep="-"), + collapse = ", "), + ".") + ) + } + + period_truth <- period_truth %>% + group_by(.data$geo_value, .data$target_end_date) %>% + summarize(actual = sum(.data$value), .groups="drop") %>% + select(.data$target_end_date, .data$actual, .data$geo_value) + + return(period_truth) +} From 49ec2b42ff802c80594a968b588ccac761e4d01e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 18 Feb 2022 18:38:54 -0500 Subject: [PATCH 2/4] styler --- Report/create_reports.R | 4 +- Report/score.R | 81 +++++++++++++++++++++-------------------- 2 files changed, 44 insertions(+), 41 deletions(-) diff --git a/Report/create_reports.R b/Report/create_reports.R index 0e01cc8..af8239f 100644 --- a/Report/create_reports.R +++ b/Report/create_reports.R @@ -133,8 +133,8 @@ for (signal_name in signals) { print("Evaluating state forecasts") geo_type <- "state" state_scores <- evaluate_covidcast( - state_predictions, - signals, + state_predictions, + signals, err_measures, geo_type = geo_type ) diff --git a/Report/score.R b/Report/score.R index 3ce6e95..973e017 100644 --- a/Report/score.R +++ b/Report/score.R @@ -130,12 +130,12 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) { "confirmed_admissions_covid_1d" ) assert_that(all(signals %in% allowed_signals), - msg = paste( - "Signal not allowed:", - setdiff(signals, allowed_signals) - ) + msg = paste( + "Signal not allowed:", + setdiff(signals, allowed_signals) + ) ) - + source_map <- list( "confirmed_incidence_num" = "jhu-csse", "deaths_incidence_num" = "jhu-csse", @@ -148,20 +148,20 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) { source <- source_map[[signal_name]] covidcast_truth <- get_covidcast_period_actuals(preds_signal) signal_scores <- evaluate_predictions(preds_signal, - truth_data = covidcast_truth, - err_measures, - grp_vars = c( - "target_end_date", - "geo_value", - "ahead", - "forecaster" - ) + truth_data = covidcast_truth, + err_measures, + grp_vars = c( + "target_end_date", + "geo_value", + "ahead", + "forecaster" + ) ) scores[[signal_name]] <- signal_scores } - + scores <- bind_rows(scores) %>% - arrange(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period) %>% + arrange(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period) %>% select(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period, everything) return(scores) } @@ -170,9 +170,9 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) { get_covidcast_period_actuals <- function(response) { # Get start/end dates of each period we want to sum truth values over. target_periods <- response %>% - select(.data$forecast_date, .data$incidence_period, .data$ahead) %>% - distinct() %>% - purrr::pmap_dfr(get_target_period) %>% + select(.data$forecast_date, .data$incidence_period, .data$ahead) %>% + distinct() %>% + purrr::pmap_dfr(get_target_period) %>% distinct() # Compute the actual values that the forecaster is trying to @@ -181,46 +181,49 @@ get_covidcast_period_actuals <- function(response) { # - sum up the response over the target incidence period target_periods <- target_periods %>% mutate(available = .data$end <= Sys.Date()) %>% - filter(.data$available) %>% + filter(.data$available) %>% select(-.data$available) - + covidcast_truth <- covidcast::covidcast_signal( source, signal_name, geo_type = geo_type, start_day = as.Date(min(target_periods$start)), end_day = as.Date(max(target_periods$end)) - ) %>% + ) %>% select(data_source, signal, geo_value, time_value, value) - + # Expand out each period by day so easier to join on. target_periods <- target_periods %>% pmap_dfr(function(start_date, end_date) { - tibble(start=start_date, - target_end_date = end_date, - day = seq.Date(from=start_date, to=end_date, by = 1) + tibble( + start = start_date, + target_end_date = end_date, + day = seq.Date(from = start_date, to = end_date, by = 1) ) }) - - period_truth <- full_join(covidcast_truth, target_periods, by=c("time_value"="day")) - + + period_truth <- full_join(covidcast_truth, target_periods, by = c("time_value" = "day")) + check_count <- period_truth %>% group_by(.data$geo_value, .data$start, .data$target_end_date) %>% - summarize(num = n(), .groups="drop") %>% + summarize(num = n(), .groups = "drop") %>% filter(num < 7) - + if (nrow(check_count) != 0) { - warning(paste0("Some or all data missing for the following target periods: ", - paste( - paste(period_truth$start, period_truth$target_end_date, sep="-"), - collapse = ", "), - ".") - ) + warning(paste0( + "Some or all data missing for the following target periods: ", + paste( + paste(period_truth$start, period_truth$target_end_date, sep = "-"), + collapse = ", " + ), + "." + )) } - + period_truth <- period_truth %>% group_by(.data$geo_value, .data$target_end_date) %>% - summarize(actual = sum(.data$value), .groups="drop") %>% + summarize(actual = sum(.data$value), .groups = "drop") %>% select(.data$target_end_date, .data$actual, .data$geo_value) - + return(period_truth) } From ca1dd260a0530e98645257c0e118a7e83dfc667e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 14 Apr 2022 12:51:53 -0400 Subject: [PATCH 3/4] pass meta args to truth data-fetcher --- Report/score.R | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/Report/score.R b/Report/score.R index 435a27e..ea4b226 100644 --- a/Report/score.R +++ b/Report/score.R @@ -135,7 +135,7 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) { setdiff(signals, allowed_signals) ) ) - + source_map <- list( "confirmed_incidence_num" = "jhu-csse", "deaths_incidence_num" = "jhu-csse", @@ -146,7 +146,7 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) { preds_signal <- predictions %>% filter(signal == signal_name) source <- source_map[[signal_name]] - covidcast_truth <- get_covidcast_period_actuals(preds_signal) + covidcast_truth <- get_covidcast_period_actuals(preds_signal, source, signal_name, geo_type) signal_scores <- evaluate_predictions(preds_signal, truth_data = covidcast_truth, err_measures, @@ -159,15 +159,15 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) { ) scores[[signal_name]] <- signal_scores } - + scores <- bind_rows(scores) %>% arrange(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period) %>% - select(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period, everything) + select(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period, everything()) return(scores) } -get_covidcast_period_actuals <- function(response) { +get_covidcast_period_actuals <- function(response, source, signal_name, geo_type) { # Get start/end dates of each period we want to sum truth values over. target_periods <- response %>% select(.data$forecast_date, .data$incidence_period, .data$ahead) %>% @@ -194,7 +194,7 @@ get_covidcast_period_actuals <- function(response) { select(data_source, signal, geo_value, time_value, value) # Expand out each period by day so easier to join on. - target_periods <- target_periods %>% pmap_dfr(function(start_date, end_date) { + target_periods <- target_periods %>% purrr::pmap_dfr(function(start_date, end_date) { tibble( start = start_date, target_end_date = end_date, @@ -204,20 +204,23 @@ get_covidcast_period_actuals <- function(response) { period_truth <- full_join(covidcast_truth, target_periods, by = c("time_value" = "day")) - check_count <- period_truth %>% - group_by(.data$geo_value, .data$start, .data$target_end_date) %>% - summarize(num = n(), .groups = "drop") %>% - filter(num < 7) - - if (nrow(check_count) != 0) { - warning(paste0( - "Some or all data missing for the following target periods: ", - paste( - paste(period_truth$start, period_truth$target_end_date, sep = "-"), - collapse = ", " - ), - "." - )) + if (signal_name != "confirmed_admissions_covid_1d") { + # For deaths and cases, expect each truth data period to cover a week + check_count <- period_truth %>% + group_by(.data$geo_value, .data$start, .data$target_end_date) %>% + summarize(num = n(), .groups = "drop") %>% + filter(num < 7) + + if (nrow(check_count) != 0) { + warning(paste0( + "Some or all data missing for the following target periods: ", + paste( + paste(period_truth$start, period_truth$target_end_date, sep = "-"), + collapse = ", " + ), + "." + )) + } } period_truth <- period_truth %>% From 6c6857be1f899f2777329349ca0d466222d693e6 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 14 Apr 2022 13:00:39 -0400 Subject: [PATCH 4/4] styler --- Report/score.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Report/score.R b/Report/score.R index ea4b226..942ff1f 100644 --- a/Report/score.R +++ b/Report/score.R @@ -135,7 +135,7 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) { setdiff(signals, allowed_signals) ) ) - + source_map <- list( "confirmed_incidence_num" = "jhu-csse", "deaths_incidence_num" = "jhu-csse", @@ -159,7 +159,7 @@ evaluate_covidcast <- function(predictions, signals, err_measures, geo_type) { ) scores[[signal_name]] <- signal_scores } - + scores <- bind_rows(scores) %>% arrange(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period) %>% select(ahead, geo_value, forecaster, forecast_date, data_source, signal, target_end_date, incidence_period, everything()) @@ -210,7 +210,7 @@ get_covidcast_period_actuals <- function(response, source, signal_name, geo_type group_by(.data$geo_value, .data$start, .data$target_end_date) %>% summarize(num = n(), .groups = "drop") %>% filter(num < 7) - + if (nrow(check_count) != 0) { warning(paste0( "Some or all data missing for the following target periods: ",