diff --git a/Makefile b/Makefile
index 3ed41c6..2becaa4 100644
--- a/Makefile
+++ b/Makefile
@@ -29,15 +29,15 @@ r_build:
 	docker build --no-cache --force-rm --pull -t forecast-eval-build docker_build
 
 # Download the named file from the AWS S3 bucket
-%.rds: dist
+%.csv.gz: dist
 	test -f dist/$@ || curl -o dist/$@ $(S3_URL)/$@
 
 # Specify all the data files we want to download
-pull_data: score_cards_state_deaths.rds score_cards_state_cases.rds score_cards_nation_cases.rds score_cards_nation_deaths.rds score_cards_state_hospitalizations.rds score_cards_nation_hospitalizations.rds datetime_created_utc.rds
+pull_data: score_cards_state_deaths.csv.gz score_cards_state_cases.csv.gz score_cards_nation_cases.csv.gz score_cards_nation_deaths.csv.gz score_cards_state_hospitalizations.csv.gz score_cards_nation_hospitalizations.csv.gz datetime_created_utc.csv.gz
 
 # Download all the predictions cards objects. This is
 # useful for development and debugging
-pull_pred_cards: predictions_cards_confirmed_admissions_covid_1d.rds predictions_cards_confirmed_incidence_num.rds predictions_cards_deaths_incidence_num.rds
+pull_pred_cards: predictions_cards_confirmed_admissions_covid_1d.csv.gz predictions_cards_confirmed_incidence_num.csv.gz predictions_cards_deaths_incidence_num.csv.gz
 
 # Create the dist directory
 dist:
@@ -59,7 +59,7 @@ score_forecast: r_build dist pull_data
 
 # Post scoring pipeline output files to the AWS S3 bucket
 deploy: score_forecast
-	aws s3 cp dist/ $(S3_BUCKET)/ --recursive --exclude "*" --include "*rds" --acl public-read
+	aws s3 cp dist/ $(S3_BUCKET)/ --recursive --exclude "*" --include "*csv.gz" --acl public-read
 
 # Run bash in a docker container with a full preconfigured R environment
 #
diff --git a/Report/create_reports.R b/Report/create_reports.R
index 6ccbb0c..cb168d3 100644
--- a/Report/create_reports.R
+++ b/Report/create_reports.R
@@ -4,6 +4,7 @@ library("dplyr")
 library("evalcast")
 library("lubridate")
 library("stringr")
+library("data.table")
 
 # TODO: Contains fixed versions of WIS component metrics, to be ported over to evalcast
 # Redefines overprediction, underprediction and sharpness
@@ -23,7 +24,8 @@ option_list <- list(
 opt_parser <- OptionParser(option_list = option_list)
 opt <- parse_args(opt_parser)
 output_dir <- opt$dir
-prediction_cards_filename <- "predictions_cards_${signal}.rds"
+# Use a compressed format since the prediction cards object is large
+prediction_cards_filename <- "predictions_cards_${signal}.csv.gz"
 prediction_cards_filepath <- case_when(
   !is.null(output_dir) ~ file.path(output_dir, prediction_cards_filename),
   TRUE ~ prediction_cards_filename
@@ -106,17 +108,17 @@ class(predictions_cards) <- c("predictions_cards", class(predictions_cards))
 print("Saving predictions...")
 if (length(signals) == 1) {
   signal <- signals
-  saveRDS(predictions_cards,
-    file = str_interp(prediction_cards_filepath),
-    compress = "xz"
+  fwrite(predictions_cards,
+    file = str_interp(prediction_cards_filepath)
   )
 } else {
   # Save each signal separately.
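+  # `prediction_cards_filepath` contains a literal "${signal}" placeholder;
+  # `str_interp` fills it in from the `signal` variable bound in each iteration.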
   for (signal_group in group_split(predictions_cards, signal)) {
     signal <- signal_group$signal[1]
-    saveRDS(signal_group,
-      file = str_interp(prediction_cards_filepath),
-      compress = "xz"
+    fwrite(signal_group,
+      file = str_interp(prediction_cards_filepath)
     )
   }
 }
@@ -228,5 +228,5 @@ if (length(save_score_errors) > 0) {
   stop(paste(save_score_errors, collapse = "\n"))
 }
 
-saveRDS(data.frame(datetime = c(data_pull_timestamp)), file = file.path(output_dir, "datetime_created_utc.rds"))
+fwrite(data.frame(datetime = c(data_pull_timestamp)), file = file.path(output_dir, "datetime_created_utc.csv.gz"))
 print("Done")
diff --git a/Report/score.R b/Report/score.R
index 364677a..2a98eb7 100644
--- a/Report/score.R
+++ b/Report/score.R
@@ -13,7 +13,7 @@ generate_score_card_file_path <- function(geo_type, signal_name, output_dir) {
     output_dir,
     paste0(
       "score_cards_", geo_type, "_",
-      sig_suffix, ".rds"
+      sig_suffix, ".csv.gz"
     )
   )
   return(output_file_name)
@@ -44,10 +44,7 @@ save_score_cards <- function(score_card, geo_type = c("state", "nation"),
   }
 
   output_file_name <- generate_score_card_file_path(geo_type, signal_name, output_dir)
-  saveRDS(score_card,
-    file = output_file_name,
-    compress = "xz"
-  )
+  fwrite(score_card, file = output_file_name)
 }
 
 save_score_cards_wrapper <- function(score_card, geo_type, signal_name, output_dir) {
diff --git a/Report/utils.R b/Report/utils.R
index 311bb38..ed09705 100644
--- a/Report/utils.R
+++ b/Report/utils.R
@@ -1,6 +1,6 @@
 check_for_missing_forecasters <- function(predictions_cards, forecasters_list, geo_type, signal_name, output_dir) {
   output_file_name <- generate_score_card_file_path(geo_type, signal_name, output_dir)
-  previous_run_forecasters <- readRDS(output_file_name) %>%
+  previous_run_forecasters <- fread(output_file_name, data.table = FALSE) %>%
     filter(signal == signal_name) %>%
     distinct(forecaster) %>%
     pull()
diff --git a/app/R/data.R b/app/R/data.R
index 1079939..ed78bbe 100644
--- a/app/R/data.R
+++ b/app/R/data.R
@@ -1,4 +1,5 @@
 library(aws.s3)
+library(data.table)
 
 # Set application-level caching location. Stores up to 1GB of caches. Removes
 # least recently used objects first.
@@ -33,16 +34,27 @@ getData <- function(filename, s3bucket) {
   if (!is.null(s3bucket)) {
-    tryCatch(
+    result <- tryCatch(
       {
-        aws.s3::s3readRDS(object = filename, bucket = s3bucket)
+        s3read_using(fread, data.table = FALSE, object = filename, bucket = s3bucket)
       },
       error = function(e) {
         e
         getFallbackData(filename)
       }
     )
   } else {
-    getFallbackData(filename)
+    result <- getFallbackData(filename)
   }
+
+  if (filename != "datetime_created_utc.csv.gz") {
+    # fread uses the `IDate` class for dates. This causes problems downstream,
+    # so cast to base `Date`.
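+    # (`fread` gives such columns class c("IDate", "Date"); `as.Date` keeps
+    # only the base "Date" class.)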
+    result$target_end_date <- as.Date(result$target_end_date)
+    result$forecast_date <- as.Date(result$forecast_date)
+  }
+
+  return(result)
 }
 
 createS3DataFactory <- function(s3bucket) {
@@ -57,12 +67,12 @@ getFallbackData <- function(filename) {
     filename,
     file.path("../dist/", filename)
   )
-  readRDS(path)
+  fread(path, data.table = FALSE)
 }
 
 getCreationDate <- function(loadFile) {
-  dataCreationDate <- loadFile("datetime_created_utc.rds")
+  dataCreationDate <- loadFile("datetime_created_utc.csv.gz")
 
   return(dataCreationDate %>% pull(datetime) %>% as.Date())
 }
 
@@ -70,16 +80,16 @@ getAllData <- function(loadFile, targetVariable) {
   df <- switch(targetVariable,
     "Deaths" = bind_rows(
-      loadFile("score_cards_state_deaths.rds"),
-      loadFile("score_cards_nation_deaths.rds")
+      loadFile("score_cards_state_deaths.csv.gz"),
+      loadFile("score_cards_nation_deaths.csv.gz")
     ),
     "Cases" = bind_rows(
-      loadFile("score_cards_state_cases.rds"),
-      loadFile("score_cards_nation_cases.rds")
+      loadFile("score_cards_state_cases.csv.gz"),
+      loadFile("score_cards_nation_cases.csv.gz")
     ),
     "Hospitalizations" = bind_rows(
-      loadFile("score_cards_state_hospitalizations.rds"),
-      loadFile("score_cards_nation_hospitalizations.rds")
+      loadFile("score_cards_state_hospitalizations.csv.gz"),
+      loadFile("score_cards_nation_hospitalizations.csv.gz")
     )
   )
 
diff --git a/app/assets/about.md b/app/assets/about.md
index 53f3766..aeac897 100644
--- a/app/assets/about.md
+++ b/app/assets/about.md
@@ -91,21 +91,26 @@ The forecasts and scores are available as RDS files and are uploaded weekly to a
 You can use the url https://forecast-eval.s3.us-east-2.amazonaws.com/ + filename to download any of the files from the bucket.
 
-For instance: https://forecast-eval.s3.us-east-2.amazonaws.com/score_cards_nation_cases.rds to download scores for nation level case predictions.
+For instance: https://forecast-eval.s3.us-east-2.amazonaws.com/score_cards_nation_cases.csv.gz to download scores for nation-level case predictions.
 
 The available files are:
-* predictions_cards.rds (forecasts)
-* score_cards_nation_cases.rds
-* score_cards_nation_deaths.rds
-* score_cards_state_cases.rds
-* score_cards_state_deaths.rds
-* score_cards_state_hospitalizations.rds
-* score_cards_nation_hospitalizations.rds
+* predictions_cards_confirmed_incidence_num.csv.gz (forecasts for cases)
+* predictions_cards_deaths_incidence_num.csv.gz (forecasts for deaths)
+* predictions_cards_confirmed_admissions_covid_1d.csv.gz (forecasts for hospitalizations)
+* score_cards_nation_cases.csv.gz
+* score_cards_nation_deaths.csv.gz
+* score_cards_state_cases.csv.gz
+* score_cards_state_deaths.csv.gz
+* score_cards_state_hospitalizations.csv.gz
+* score_cards_nation_hospitalizations.csv.gz
 
 You can also connect to AWS and retrieve the data in R.
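+Note that reading the compressed files with `fread` requires the `R.utils` package to be installed.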
 
 Example of retrieving state cases file:
 ```
 library(aws.s3)
+library(data.table)
+
 Sys.setenv("AWS_DEFAULT_REGION" = "us-east-2")
 s3bucket = tryCatch(
   {
@@ -118,14 +122,20 @@ s3bucket = tryCatch(
 
 stateCases = tryCatch(
   {
-    s3readRDS(object = "score_cards_state_cases.rds", bucket = s3bucket)
+    s3read_using(fread, object = "score_cards_state_cases.csv.gz", bucket = s3bucket)
   },
   error = function(e) {
     e
   }
 )
 ```
-
+
+Or you can read a file directly from its URL:
+
+```
+library(data.table)
+stateCases <- fread("https://forecast-eval.s3.us-east-2.amazonaws.com/score_cards_state_cases.csv.gz")
+```
 
 ##### Forecasts with actuals
diff --git a/app/assets/forecastsWithActuals.R b/app/assets/forecastsWithActuals.R
index 3fb84ce..a9389c5 100644
--- a/app/assets/forecastsWithActuals.R
+++ b/app/assets/forecastsWithActuals.R
@@ -1,11 +1,12 @@
 library(dplyr)
 library(tidyr)
 library(aws.s3)
+library(data.table)
 
 Sys.setenv("AWS_DEFAULT_REGION" = "us-east-2")
 s3bucket <- tryCatch(
   {
-    get_bucket(bucket = "forecast-eval")
+    aws.s3::get_bucket(bucket = "forecast-eval")
   },
   error = function(e) {
     e
@@ -15,7 +16,7 @@ s3bucket <- tryCatch(
 readbucket <- function(name) {
   tryCatch(
     {
-      s3readRDS(object = name, bucket = s3bucket)
+      s3read_using(fread, data.table = FALSE, object = name, bucket = s3bucket)
     },
     error = function(e) {
       e
@@ -25,24 +26,21 @@ readbucket <- function(name) {
 
 # Cases, deaths, hosp scores: needed for "actual"s
 cases <- bind_rows(
-  readbucket("score_cards_nation_cases.rds"),
-  readbucket("score_cards_state_cases.rds")
+  readbucket("score_cards_nation_cases.csv.gz"),
+  readbucket("score_cards_state_cases.csv.gz")
 )
 deaths <- bind_rows(
-  readbucket("score_cards_nation_deaths.rds"),
-  readbucket("score_cards_state_deaths.rds")
+  readbucket("score_cards_nation_deaths.csv.gz"),
+  readbucket("score_cards_state_deaths.csv.gz")
 )
 hosp <- bind_rows(
-  readbucket("score_cards_nation_hospitalizations.rds"),
-  readbucket("score_cards_state_hospitalizations.rds")
+  readbucket("score_cards_nation_hospitalizations.csv.gz"),
+  readbucket("score_cards_state_hospitalizations.csv.gz")
 )
 
-# The big one: predictions from all forecasters
-pred <- readbucket("predictions_cards.rds")
-
 # Cases
+pred <- readbucket("predictions_cards_confirmed_incidence_num.csv.gz")
 pred_cases <- pred %>%
-  filter(signal == "confirmed_incidence_num") %>%
   mutate(signal = NULL, data_source = NULL, incidence_period = NULL) %>%
   pivot_wider(
     names_from = quantile,
@@ -50,6 +48,9 @@ pred_cases <- pred %>%
     names_prefix = "forecast_"
   )
 
+rm(pred)
+gc()
+
 actual_cases <- cases %>%
   select(ahead, geo_value, forecaster, forecast_date, target_end_date, actual)
 
@@ -58,8 +59,8 @@ sum(is.na(actual_cases$actual)) == sum(is.na(joined_cases$actual))
 write.csv(joined_cases, "cases.csv")
 
 # Deaths
+pred <- readbucket("predictions_cards_deaths_incidence_num.csv.gz")
 pred_deaths <- pred %>%
-  filter(signal == "deaths_incidence_num") %>%
   mutate(signal = NULL, data_source = NULL, incidence_period = NULL) %>%
   pivot_wider(
     names_from = quantile,
@@ -67,6 +68,9 @@ pred_deaths <- pred %>%
     names_prefix = "forecast_"
   )
 
+rm(pred)
+gc()
+
 actual_deaths <- deaths %>%
   select(ahead, geo_value, forecaster, forecast_date, target_end_date, actual)
 
@@ -75,12 +79,15 @@ sum(is.na(actual_deaths$actual)) == sum(is.na(joined_deaths$actual))
 write.csv(joined_deaths, "deaths.csv")
 
 # Hospitalizations: break up by weeks since we run into memory errors o/w!
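+# Each iteration below handles one week of horizons: chunk k covers aheads
+# (k - 1) * 7 + 1:7, so k = 1:4 spans aheads 1 through 28.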
+pred <- readbucket("predictions_cards_confirmed_admissions_covid_1d.csv.gz")
 pred_hosp <- actual_hosp <- joined_hosp <- vector(mode = "list", length = 4)
 for (k in 1:4) {
   cat(k, "... ")
   days <- (k - 1) * 7 + 1:7
   pred_hosp[[k]] <- pred %>%
-    filter(signal == "confirmed_admissions_covid_1d", ahead %in% days) %>%
+    filter(ahead %in% days) %>%
     mutate(signal = NULL, data_source = NULL, incidence_period = NULL) %>%
     pivot_wider(
       names_from = quantile,
diff --git a/app/server.R b/app/server.R
index 1d7b067..53a8328 100644
--- a/app/server.R
+++ b/app/server.R
@@ -402,6 +402,8 @@ server <- function(input, output, session) {
   # Fill gaps so there are line breaks on weeks without data
   # This is failing for CU-select on US deaths (https://github.com/cmu-delphi/forecast-eval/issues/157)
   filteredScoreDf <- filteredScoreDf %>%
+    # Ensure Week_End_Date is base Date (not data.table's IDate) before as_tsibble
+    mutate(Week_End_Date = as.Date(Week_End_Date)) %>%
     as_tsibble(key = c(Forecaster, ahead), index = Week_End_Date) %>%
     group_by(Forecaster, Forecast_Date, ahead) %>%
     fill_gaps(.full = TRUE)
diff --git a/devops/Dockerfile b/devops/Dockerfile
index d64a6fa..9bcc4e6 100644
--- a/devops/Dockerfile
+++ b/devops/Dockerfile
@@ -9,7 +9,8 @@ RUN apt-get update && apt-get install -qq -y \
 COPY devops/shiny_server.conf /etc/shiny-server/shiny-server.conf
 WORKDIR /srv/shinyapp/
 COPY DESCRIPTION ./
-RUN Rscript -e "devtools::install_deps(dependencies = NA)"
-COPY dist/*.rds ./
+# R.utils is needed by data.table::fread to read the .csv.gz data files
+RUN Rscript -e "devtools::install_deps(dependencies = NA); install.packages('R.utils')"
+COPY dist/*.csv.gz ./
 COPY app/ ./
 RUN chmod -R a+rw .