Merge pull request #290 from dsweber2/newDataFormat

nmdefries · web-flow · commit a7bcb033fbd4 · 2023-06-23T15:35:18.000-04:00
docs: add links to forecast-actual format repos
diff --git a/app/assets/about.md b/app/assets/about.md
@@ -127,3 +127,18 @@ stateCases = tryCatch(
 ```
   
   
+
+##### Forecasts with actuals
+
+If you are interested in getting the forecasts paired with the corresponding actual values (if you were e.g. testing different evaluation methods), that can be found in [the Amazon S3 bucket](https://forecast-eval.s3.us-east-2.amazonaws.com/) in 3 zip files.
+These files are static, generated using [the aggregation script](https://raw.githubusercontent.com/cmu-delphi/forecast-eval/main/app/assets/forecastsWithActuals.R), and forecast and actual data available on June 12, 2023. The latest forecast date available for each target signal is
+
+* [cases](https://forecast-eval.s3.us-east-2.amazonaws.com/cases.zip): 2023-02-13
+* [hospitalizations](https://forecast-eval.s3.us-east-2.amazonaws.com/hospitalizations.zip):
+  * 1 week: 2023-06-05
+  * 2 week: 2023-06-05
+  * 3 week: 2023-06-05
+  * 4 week: 2023-06-05
+* [deaths](https://forecast-eval.s3.us-east-2.amazonaws.com/deaths.zip): 2023-03-06
+
+If the S3 bucket is down, these files are also available on [Delphi's file-hosting site](https://www.cmu.edu/delphi-web/forecast-eval-scores).
diff --git a/app/assets/forecastsWithActuals.R b/app/assets/forecastsWithActuals.R
@@ -0,0 +1,98 @@
+library(dplyr)
+library(tidyr)
+library(aws.s3)
+
+Sys.setenv("AWS_DEFAULT_REGION" = "us-east-2")
+s3bucket <- tryCatch(
+  {
+    get_bucket(bucket = "forecast-eval")
+  },
+  error = function(e) {
+    e
+  }
+)
+
+readbucket <- function(name) {
+  tryCatch(
+    {
+      s3readRDS(object = name, bucket = s3bucket)
+    },
+    error = function(e) {
+      e
+    }
+  )
+}
+
+# Cases, deaths, hosp scores: needed for "actual"s
+cases <- bind_rows(
+  readbucket("score_cards_nation_cases.rds"),
+  readbucket("score_cards_state_cases.rds")
+)
+deaths <- bind_rows(
+  readbucket("score_cards_nation_deaths.rds"),
+  readbucket("score_cards_state_deaths.rds")
+)
+hosp <- bind_rows(
+  readbucket("score_cards_nation_hospitalizations.rds"),
+  readbucket("score_cards_state_hospitalizations.rds")
+)
+
+# The big one: predictions from all forecasters
+pred <- readbucket("predictions_cards.rds")
+
+# Cases
+pred_cases <- pred %>%
+  filter(signal == "confirmed_incidence_num") %>%
+  mutate(signal = NULL, data_source = NULL, incidence_period = NULL) %>%
+  pivot_wider(
+    names_from = quantile,
+    values_from = value,
+    names_prefix = "forecast_"
+  )
+
+actual_cases <- cases %>%
+  select(ahead, geo_value, forecaster, forecast_date, target_end_date, actual)
+
+joined_cases <- left_join(pred_cases, actual_cases)
+sum(is.na(actual_cases$actual)) == sum(is.na(joined_cases$actual))
+write.csv(joined_cases, "cases.csv")
+
+# Deaths
+pred_deaths <- pred %>%
+  filter(signal == "deaths_incidence_num") %>%
+  mutate(signal = NULL, data_source = NULL, incidence_period = NULL) %>%
+  pivot_wider(
+    names_from = quantile,
+    values_from = value,
+    names_prefix = "forecast_"
+  )
+
+actual_deaths <- deaths %>%
+  select(ahead, geo_value, forecaster, forecast_date, target_end_date, actual)
+
+joined_deaths <- left_join(pred_deaths, actual_deaths)
+sum(is.na(actual_deaths$actual)) == sum(is.na(joined_deaths$actual))
+write.csv(joined_deaths, "deaths.csv")
+
+# Hospitalizations: break up by weeks since we run into memory errors o/w!
+pred_hosp <- actual_hosp <- joined_hosp <- vector(mode = "list", length = 4)
+for (k in 1:4) {
+  cat(k, "... ")
+  days <- (k - 1) * 7 + 1:7
+  pred_hosp[[k]] <- pred %>%
+    filter(signal == "confirmed_admissions_covid_1d", ahead %in% days) %>%
+    mutate(signal = NULL, data_source = NULL, incidence_period = NULL) %>%
+    pivot_wider(
+      names_from = quantile,
+      values_from = value,
+      names_prefix = "forecast_"
+    )
+
+  actual_hosp[[k]] <- hosp %>%
+    filter(ahead %in% days) %>%
+    select(ahead, geo_value, forecaster, forecast_date, target_end_date, actual)
+
+  joined_hosp[[k]] <- left_join(pred_hosp[[k]], actual_hosp[[k]])
+  cat(sum(is.na(actual_hosp[[k]]$act)) == sum(is.na(joined_hosp[[k]]$act)))
+  write.csv(joined_hosp[[k]], sprintf("hospitalizations_%iwk.csv", k))
+}