diff --git a/DESCRIPTION b/DESCRIPTION
index ad32b71e..01a09444 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -27,9 +27,11 @@ Imports:
     purrr,
     recipes (>= 1.0.4),
     rlang,
+    slider,
    targets,
     tibble,
-    tidyr
+    tidyr,
+    zeallot
 Suggests:
     ggplot2,
     knitr,
diff --git a/NAMESPACE b/NAMESPACE
index 72811f7d..3298ff06 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -28,15 +28,19 @@ export(make_shared_grids)
 export(make_target_ensemble_grid)
 export(make_target_param_grid)
 export(overprediction)
-export(perform_sanity_checks)
 export(read_external_predictions_data)
+export(rolling_mean)
+export(rolling_sd)
 export(run_evaluation_measure)
 export(run_workflow_and_format)
+export(sanitize_args_predictors_trainer)
 export(scaled_pop)
 export(sharpness)
 export(single_id)
 export(slide_forecaster)
+export(smoothed_scaled)
 export(underprediction)
+export(update_predictors)
 export(weighted_interval_score)
 importFrom(assertthat,assert_that)
 importFrom(cli,cli_abort)
@@ -81,13 +85,16 @@ importFrom(epipredict,step_epi_naomit)
 importFrom(epipredict,step_population_scaling)
 importFrom(epipredict,step_training_window)
 importFrom(epiprocess,as_epi_df)
+importFrom(epiprocess,epi_slide)
 importFrom(epiprocess,epix_slide)
 importFrom(magrittr,"%<>%")
 importFrom(magrittr,"%>%")
 importFrom(purrr,imap)
 importFrom(purrr,map)
 importFrom(purrr,map2_vec)
+importFrom(purrr,map_chr)
 importFrom(purrr,map_vec)
+importFrom(purrr,reduce)
 importFrom(purrr,transpose)
 importFrom(recipes,all_numeric)
 importFrom(rlang,"!!")
@@ -96,6 +103,7 @@ importFrom(rlang,.data)
 importFrom(rlang,quo)
 importFrom(rlang,sym)
 importFrom(rlang,syms)
+importFrom(slider,slide_dbl)
 importFrom(targets,tar_config_get)
 importFrom(targets,tar_group)
 importFrom(targets,tar_read)
@@ -105,3 +113,4 @@ importFrom(tidyr,drop_na)
 importFrom(tidyr,expand_grid)
 importFrom(tidyr,pivot_wider)
 importFrom(tidyr,unnest)
+importFrom(zeallot,"%<-%")
diff --git a/R/data_transforms.R b/R/data_transforms.R
new file mode 100644
index 00000000..7329823d
--- /dev/null
+++ b/R/data_transforms.R
@@ -0,0 +1,114 @@
+# various reusable transforms to apply before handing to epipredict
+
+#' extract the non-key, non-smoothed columns from epi_data
+#' @keywords internal
+#' @param epi_data the `epi_df`
+#' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
+get_trainable_names <- function(epi_data, cols) {
+  if (is.null(cols)) {
+    cols <- get_nonkey_names(epi_data)
+    # exclude anything with the same naming schema as the rolling average/sd created below
+    cols <- cols[!grepl("_\\w{1,2}\\d+", cols)]
+  }
+  return(cols)
+}
+
+#' just the names which aren't keys for an epi_df
+#' @description
+#' the column names, excluding the keys
+#' @param epi_data the epi_df
get_nonkey_names <- function(epi_data) {
+  cols <- names(epi_data)
+  cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))]
+  return(cols)
+}
+
+
+#' update the predictors to only contain the smoothed/sd versions of cols
+#' @description
+#' modifies the list of predictors so that any which have been modified have the
+#' modified versions included, and not the original. Should only be applied
+#' after both rolling_mean and rolling_sd.
+#' @param epi_data the epi_df, only included to get the non-key column names
+#' @param cols_modified the list of columns which have been modified. If this is `NULL`, that means we were modifying every column.
+#' @param predictors the initial set of predictors; any unmodified are kept, any
"a" becoming "a_m17"). +#' @importFrom purrr map map_chr reduce +#' @return returns an updated list of predictors, with modified columns replaced and non-modified columns left intact. +#' @export +update_predictors <- function(epi_data, cols_modified, predictors) { + if (!is.null(cols_modified)) { + # if cols_modified isn't null, make sure we include predictors that weren't modified + unchanged_predictors <- map(cols_modified, ~ !grepl(.x, predictors, fixed = TRUE)) %>% reduce(`&`) + unchanged_predictors <- predictors[unchanged_predictors] + } else { + # if it's null, we've modified every predictor + unchanged_predictors <- character(0L) + } + # all the non-key names + col_names <- get_nonkey_names(epi_data) + is_present <- function(original_predictor) { + grepl(original_predictor, col_names) & !(col_names %in% predictors) + } + is_modified <- map(predictors, is_present) %>% reduce(`|`) + new_predictors <- col_names[is_modified] + return(c(unchanged_predictors, new_predictors)) +} + +#' get a rolling average for the named columns +#' @description +#' add column(s) that are the rolling means of the specified columns, as +#' implemented by slider. Defaults to the previous 7 days. +#' Currently only group_by's on the geo_value. Should probably extend to more +#' keys if you have them +#' @param epi_data the dataset +#' @param width the number of days (or examples, the sliding isn't time-aware) to use +#' @param cols_to_mean the non-key columns to take the mean over. `NULL` means all +#' @importFrom slider slide_dbl +#' @importFrom epiprocess epi_slide +#' @export +rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) { + cols_to_mean <- get_trainable_names(epi_data, cols_to_mean) + epi_data %<>% group_by(geo_value) + for (col in cols_to_mean) { + mean_name <- paste0(col, "_m", width) + epi_data %<>% epi_slide(~ mean(.x[[col]], rm.na = TRUE), before = width-1L, new_col_name = mean_name) + } + epi_data %<>% ungroup() + return(epi_data) +} + +#' get a rolling standard deviation for the named columns +#' @description +#' A rolling standard deviation, based off of a rolling mean. First it +#' calculates a rolling mean with width `mean_width`, and then squares the +#' difference between that and the actual value, averaged over `sd_width`. +#' @param epi_data the dataset +#' @param sd_width the number of days (or examples, the sliding isn't +#' time-aware) to use for the standard deviation calculation +#' @param mean_width like `sd_width`, but it governs the mean. Should be less +#' than the `sd_width`, and if `NULL` (the default) it is half of `sd_width` +#' (so 14 in the complete default case) +#' @param cols_to_sd the non-key columns to take the sd over. 
+
+#' get a rolling standard deviation for the named columns
+#' @description
+#' A rolling standard deviation, based off of a rolling mean. First it
+#' calculates a rolling mean with width `mean_width`, and then squares the
+#' difference between that and the actual value, averaged over `sd_width`.
+#' @param epi_data the dataset
+#' @param sd_width the number of days (or examples, the sliding isn't
+#' time-aware) to use for the standard deviation calculation
+#' @param mean_width like `sd_width`, but it governs the mean. Should be less
+#' than the `sd_width`, and if `NULL` (the default) it is half of `sd_width`
+#' (so 14 in the complete default case)
+#' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
+#' @param keep_mean bool, if `TRUE`, it retains the mean column
+#' @importFrom epiprocess epi_slide
+#' @export
+rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
+  if (is.null(mean_width)) {
+    mean_width <- as.integer(ceiling(sd_width / 2))
+  }
+  cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
+  result <- epi_data
+  for (col in cols_to_sd) {
+    result %<>% group_by(geo_value)
+    mean_name <- paste0(col, "_m", mean_width)
+    sd_name <- paste0(col, "_sd", sd_width)
+    result %<>% epi_slide(~ mean(.x[[col]], na.rm = TRUE), before = mean_width - 1L, new_col_name = mean_name)
+    result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2, na.rm = TRUE)), before = sd_width - 1L, new_col_name = sd_name)
+    if (!keep_mean) {
+      # TODO make sure the extra info sticks around
+      result %<>% select(-{{ mean_name }})
+    }
+    result %<>% dplyr_reconstruct(epi_data)
+  }
+  result %<>% ungroup()
+  return(result)
+}
diff --git a/R/data_validation.R b/R/data_validation.R
index da4c134e..e9319334 100644
--- a/R/data_validation.R
+++ b/R/data_validation.R
@@ -13,7 +13,7 @@
 #' include empty strings
 #' @param args_list the args list created by [`epipredict::arx_args_list`]
 #' @export
-perform_sanity_checks <- function(epi_data,
+sanitize_args_predictors_trainer <- function(epi_data,
                                   outcome,
                                   predictors,
                                   trainer,
@@ -56,7 +56,7 @@
 #' @export
 confirm_sufficient_data <- function(epi_data, ahead, args_input, buffer = 9) {
   if (!is.null(args_input$lags)) {
-    lag_max <- max(args_input$lags)
+    lag_max <- max(unlist(args_input$lags))
   } else {
     lag_max <- 14 # default value of 2 weeks
   }
diff --git a/R/epipredict_utilities.R b/R/epipredict_utilities.R
index fc56925f..083ce35f 100644
--- a/R/epipredict_utilities.R
+++ b/R/epipredict_utilities.R
@@ -2,25 +2,25 @@
 #' add the default steps for arx_forecaster
 #' @description
 #' add the default steps for arx_forecaster
-#' @param rec an [`epipredict::epi_recipe`]
+#' @param preproc an [`epipredict::epi_recipe`]
 #' @param outcome a character of the column to be predicted
 #' @param predictors a character vector of the columns used as predictors
 #' @param args_list an [`epipredict::arx_args_list`]
 #' @seealso [arx_postprocess] for the layer equivalent
 #' @importFrom epipredict step_epi_lag step_epi_ahead step_epi_naomit step_training_window
 #' @export
-arx_preprocess <- function(rec, outcome, predictors, args_list) {
+arx_preprocess <- function(preproc, outcome, predictors, args_list) {
   # input already validated
   lags <- args_list$lags
   for (l in seq_along(lags)) {
     p <- predictors[l]
-    rec %<>% step_epi_lag(!!p, lag = lags[[l]])
+    preproc %<>% step_epi_lag(!!p, lag = lags[[l]])
   }
-  rec %<>%
+  preproc %<>%
     step_epi_ahead(!!outcome, ahead = args_list$ahead) %>%
     step_epi_naomit() %>%
     step_training_window(n_recent = args_list$n_training)
-  return(rec)
+  return(preproc)
 }
 
 # TODO replace with `layer_arx_forecaster`
diff --git a/R/forecaster_flatline.R b/R/forecaster_flatline.R
index ab0c5075..33687240 100644
--- a/R/forecaster_flatline.R
+++ b/R/forecaster_flatline.R
@@ -40,9 +40,7 @@ flatline_fc <- function(epi_data,
   args_list <- do.call(flatline_args_list, args_input)
   # if you want to ignore extra_sources, setting predictors is the way to do it
   predictors <- c(outcome, extra_sources)
-  argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, NULL, args_list)
-  args_list <- argsPredictorsTrainer[[1]]
-  predictors <- argsPredictorsTrainer[[2]]
+  c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, NULL, args_list)
   # end of the copypasta
   # finally, any other pre-processing (e.g. smoothing) that isn't performed by
   # epipredict
diff --git a/R/forecaster_scaled_pop.R b/R/forecaster_scaled_pop.R
index 6db0b8b0..6c138caa 100644
--- a/R/forecaster_scaled_pop.R
+++ b/R/forecaster_scaled_pop.R
@@ -35,9 +35,10 @@
 #' @param quantile_levels The quantile levels to predict. Defaults to those required by
 #' covidhub.
 #' @seealso some utilities for making forecasters: [format_storage],
-#' [perform_sanity_checks]
+#' [sanitize_args_predictors_trainer]
 #' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling
 #' @importFrom tibble tibble
+#' @importFrom zeallot %<-%
 #' @importFrom recipes all_numeric
 #' @export
 scaled_pop <- function(epi_data,
@@ -73,13 +74,10 @@ scaled_pop <- function(epi_data,
   args_input[["ahead"]] <- effective_ahead
   args_input[["quantile_levels"]] <- quantile_levels
   args_list <- do.call(arx_args_list, args_input)
-  # if you want to ignore extra_sources, setting predictors is the way to do it
+  # if you want to hardcode particular predictors in a particular forecaster
   predictors <- c(outcome, extra_sources)
   # TODO: Partial match quantile_level coming from here (on Dmitry's machine)
-  argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
-  args_list <- argsPredictorsTrainer[[1]]
-  predictors <- argsPredictorsTrainer[[2]]
-  trainer <- argsPredictorsTrainer[[3]]
+  c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list)
   # end of the copypasta
   # finally, any other pre-processing (e.g. smoothing) that isn't performed by
   # epipredict
diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R
new file mode 100644
index 00000000..8428cc08
--- /dev/null
+++ b/R/forecaster_smoothed_scaled.R
@@ -0,0 +1,147 @@
+#' predict on smoothed data and the standard deviation
+#' @description
+#' This is a variant of `scaled_pop`, which predicts on a smoothed version of
+#' the data. Even if the target is smoothed when used as a /predictor/, as a
+#' /target/ it still uses the raw value (this captures some of the noise). It
+#' also uses a rolling standard deviation as an auxiliary signal, with a
+#' window of width `sd_width`, which by default is 28 days.
+#' If you are using `sd_width`, you should restrict the lags on the `sd` to only
+#' include `0`, so set your lags to be e.g. `list(c(0,7,14), c(0))`.
+#' @param epi_data the actual data used
+#' @param outcome the name of the target variable
+#' @param extra_sources the name of any extra columns to use. This list could be
+#' empty
+#' @param ahead (this is relative to the `as_of` field of the `epi_df`, which is
+#' likely *not* the same as the `ahead` used by epipredict, which is relative
+#' to the max time value of the `epi_df`. how to handle this is a modelling
+#' question left up to each forecaster; see latency_adjusting.R for the
+#' existing examples)
+#' @param pop_scaling bool; if `TRUE`, assume all numeric columns are on the
+#' count scale and translate them to a rate scale for model fitting.
+#' Predictions will be translated back to count scale. Any
+#' `layer_residual_quantiles` (for non-`"quantile_reg"` `trainer`s) will be
+#' done on the rate scale. When specifying predictor lags, note that rate
+#' variables will use the same names as and overwrite the count variables.
+#' Rates here will be counts per 100k population, based on
+#' `epipredict::state_census`.
+#' @param trainer optional; parsnip model specification to use for the core
+#' fitting & prediction (the `spec` of the internal
+#' [`epipredict::epi_workflow`]). Default is `parsnip::linear_reg()`.
+#' @param smooth_width the number of days over which to do smoothing. If `NULL`,
+#' then no smoothing is applied.
+#' @param smooth_cols the names of the columns to smooth. If `NULL` it smooths
+#' everything
+#' @param sd_width the number of days over which to take a moving average of the
+#' standard deviation. If `NULL`, the sd isn't included.
+#' @param sd_mean_width the window size of the mean used internally for the sd
+#' calculation.
+#' @param sd_cols the names of the columns to take the sd of. If `NULL` it
+#' includes the sd of everything
+#' @param quantile_levels The quantile levels to predict. Defaults to those
+#' required by covidhub.
+#' @param ... any additional arguments as used by [arx_args_list]
+#' @seealso some utilities for making forecasters: [format_storage],
+#' [sanitize_args_predictors_trainer]
+#' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling
+#' @importFrom tibble tibble
+#' @importFrom recipes all_numeric
+#' @importFrom zeallot %<-%
+#' @export
+smoothed_scaled <- function(epi_data,
+                            outcome,
+                            extra_sources = "",
+                            ahead = 1,
+                            pop_scaling = TRUE,
+                            trainer = parsnip::linear_reg(),
+                            quantile_levels = covidhub_probs(),
+                            smooth_width = 7,
+                            smooth_cols = NULL,
+                            sd_width = 28,
+                            sd_mean_width = 14,
+                            sd_cols = NULL,
+                            ...) {
+  # perform any preprocessing not supported by epipredict
+  # this is a temp fix until a real fix gets put into epipredict
+  epi_data <- clear_lastminute_nas(epi_data)
+  # one that every forecaster will need to handle: how to manage max(time_value)
+  # that's older than the `as_of` date
+  c(epi_data, effective_ahead) %<-% extend_ahead(epi_data, ahead)
+  # see latency_adjusting for other examples
+  args_input <- list(...)
+  # edge case where there is no data or less data than the lags; eventually epipredict will handle this
+  if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) {
+    null_result <- epi_data[0L, c("geo_value", attr(epi_data, "metadata", exact = TRUE)[["other_keys"]])] %>%
+      mutate(
+        forecast_date = epi_data$time_value[0],
+        target_end_date = epi_data$time_value[0],
+        quantile = numeric(),
+        value = numeric()
+      )
+    return(null_result)
+  }
+  args_input[["ahead"]] <- effective_ahead
+  args_input[["quantile_levels"]] <- quantile_levels
+  args_list <- do.call(arx_args_list, args_input)
+  # `extra_sources` sets which variables beyond the outcome are lagged and used as predictors
+  # any which are modified by `rolling_mean` or `rolling_sd` have their original values dropped later
+  predictors <- c(outcome, extra_sources)
+  # end of the copypasta
+  # finally, any other pre-processing (e.g. smoothing) that isn't performed by
+  # epipredict
+  # smoothing
+  keep_mean <- !is.null(smooth_width) && !is.null(sd_mean_width) &&
+    smooth_width == sd_mean_width # do we (not) need to do the mean separately?
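+  # when the smoothing window matches the sd's internal mean window, the mean
+  # is computed once inside `rolling_sd` (keep_mean = TRUE) instead of a
+  # second time via `rolling_mean` below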
+  if (!is.null(smooth_width) && !keep_mean) {
+    epi_data %<>% rolling_mean(
+      width = smooth_width,
+      cols_to_mean = smooth_cols
+    )
+  }
+
+  # measuring standard deviation
+  if (!is.null(sd_width)) {
+    epi_data %<>% rolling_sd(
+      sd_width = sd_width,
+      mean_width = sd_mean_width,
+      cols_to_sd = sd_cols,
+      keep_mean = keep_mean
+    )
+  }
+  # and need to make sure we exclude the original variables as predictors
+  predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors)
+  # TODO: Partial match quantile_level coming from here (on Dmitry's machine)
+  c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list)
+  # preprocessing supported by epipredict
+  preproc <- epi_recipe(epi_data)
+  if (pop_scaling) {
+    preproc %<>% step_population_scaling(
+      all_numeric(),
+      df = epipredict::state_census,
+      df_pop_col = "pop",
+      create_new = FALSE,
+      rate_rescaling = 1e5,
+      by = c("geo_value" = "abbr")
+    )
+  }
+  preproc %<>% arx_preprocess(outcome, predictors, args_list)
+
+  # postprocessing supported by epipredict
+  postproc <- frosting()
+  postproc %<>% arx_postprocess(trainer, args_list)
+  if (pop_scaling) {
+    postproc %<>% layer_population_scaling(
+      .pred, .pred_distn,
+      df = epipredict::state_census,
+      df_pop_col = "pop",
+      create_new = FALSE,
+      rate_rescaling = 1e5,
+      by = c("geo_value" = "abbr")
+    )
+  }
+  # with all the setup done, we execute and format
+  pred <- run_workflow_and_format(preproc, postproc, trainer, epi_data)
+  # now pred has the columns
+  # (geo_value, forecast_date, target_end_date, quantile, value)
+  # finally, any postprocessing not supported by epipredict e.g. calibration
+  return(pred)
+}
diff --git a/R/targets_utils.R b/R/targets_utils.R
index feac27c2..d3fd435e 100644
--- a/R/targets_utils.R
+++ b/R/targets_utils.R
@@ -127,19 +127,19 @@ make_shared_grids <- function() {
       forecaster = "scaled_pop",
       trainer = c("linreg", "quantreg"),
       ahead = c(1:7, 14, 21, 28),
-      pop_scaling = c(FALSE)
+      pop_scaling = FALSE
     ),
     tidyr::expand_grid(
       forecaster = "scaled_pop",
       trainer = c("linreg", "quantreg"),
       ahead = c(1:7, 14, 21, 28),
-      lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14)),
-      pop_scaling = c(FALSE)
+      lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14), c(0, 7, 14, 24)),
+      pop_scaling = FALSE
     ),
     tidyr::expand_grid(
       forecaster = "flatline_fc",
-      ahead = 1:7
-    )
+      ahead = c(1:7, 14, 21, 28)
+    )
   )
 }
 #' Make list of common ensembles for forecasting experiments across projects
diff --git a/app.R b/app.R
index c5ecf1a2..064c4ffe 100644
--- a/app.R
+++ b/app.R
@@ -170,7 +170,7 @@ shinyApp(
       verticalLayout(
         plotlyOutput("main_plot", height = "90em"),
         h2("forecaster name -> parameters"),
-        #textOutput("forecaster_param_title"),
+        # textOutput("forecaster_param_title"),
         dataTableOutput("forecaster_table"),
         h2("ensemble name -> parameters"),
         dataTableOutput("ensemble_table")
@@ -261,7 +261,7 @@ shinyApp(
         # Use scatterplot or lines depending on the x var.
         {
           if (input$x_var %in% c(input$facet_vars, "geo_value", "forecaster", "ahead")) {
-            . + geom_point(aes(size = n)) + expand_limits(size = 0)
+            . + geom_point(aes(size = n / 100)) + expand_limits(size = 0)
           } else {
             . + geom_line()
           }
diff --git a/covid_hosp_explore.R b/covid_hosp_explore.R
index f9aa0d8e..456b4b3e 100644
--- a/covid_hosp_explore.R
+++ b/covid_hosp_explore.R
@@ -18,6 +18,13 @@ make_unique_grids <- function() {
       ahead = c(1:7, 14, 21, 28),
       lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14)),
       pop_scaling = TRUE
+    ),
+    tidyr::expand_grid(
+      forecaster = "smoothed_scaled",
+      trainer = c("quantreg"),
+      ahead = c(1:7, 14, 21, 28),
+      lags = list(
+        list(c(0, 3, 5, 7, 14), c(0), c(0, 3, 5, 7, 14), c(0)),
+        list(c(0, 7, 14), c(0)),
+        list(c(0, 2, 4, 7, 14, 21, 28), c(0))
+      ),
+      pop_scaling = TRUE
     )
   )
 }
diff --git a/flu_hosp_explore.R b/flu_hosp_explore.R
index a3486080..31d0ffc1 100644
--- a/flu_hosp_explore.R
+++ b/flu_hosp_explore.R
@@ -5,7 +5,15 @@ source("extras/targets-common.R")
 
 # Add custom parameter combinations in the list below.
 make_unique_grids <- function() {
-  list()
+  list(
+    tidyr::expand_grid(
+      forecaster = "smoothed_scaled",
+      trainer = c("quantreg"),
+      ahead = c(1:7, 14, 21, 28),
+      lags = list(
+        list(c(0, 3, 5, 7, 14), c(0), c(0, 3, 5, 7, 14), c(0)),
+        list(c(0, 7, 14), c(0)),
+        list(c(0, 2, 4, 7, 14, 21, 28), c(0))
+      ),
+      pop_scaling = FALSE
+    )
+  )
 }
 make_unique_ensemble_grid <- function() {
   tribble(
diff --git a/man/arx_preprocess.Rd b/man/arx_preprocess.Rd
index 35e2e0b0..44ea895a 100644
--- a/man/arx_preprocess.Rd
+++ b/man/arx_preprocess.Rd
@@ -4,10 +4,10 @@
 \alias{arx_preprocess}
 \title{add the default steps for arx_forecaster}
 \usage{
-arx_preprocess(rec, outcome, predictors, args_list)
+arx_preprocess(preproc, outcome, predictors, args_list)
 }
 \arguments{
-\item{rec}{an \code{\link[epipredict:epi_recipe]{epipredict::epi_recipe}}}
+\item{preproc}{an \code{\link[epipredict:epi_recipe]{epipredict::epi_recipe}}}
 
 \item{outcome}{a character of the column to be predicted}
 
diff --git a/man/get_nonkey_names.Rd b/man/get_nonkey_names.Rd
new file mode 100644
index 00000000..242ebabd
--- /dev/null
+++ b/man/get_nonkey_names.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{get_nonkey_names}
+\alias{get_nonkey_names}
+\title{just the names which aren't keys for an epi_df}
+\usage{
+get_nonkey_names(epi_data)
+}
+\arguments{
+\item{epi_data}{the epi_df}
+}
+\description{
+the column names, excluding the keys
+}
diff --git a/man/get_trainable_names.Rd b/man/get_trainable_names.Rd
new file mode 100644
index 00000000..8f5faf10
--- /dev/null
+++ b/man/get_trainable_names.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{get_trainable_names}
+\alias{get_trainable_names}
+\title{extract the non-key, non-smoothed columns from epi_data}
+\usage{
+get_trainable_names(epi_data, cols)
+}
+\arguments{
+\item{epi_data}{the \code{epi_df}}
+
+\item{cols}{vector of column names to use. If \code{NULL}, fill with all non-key columns}
+}
+\description{
+extract the non-key, non-smoothed columns from epi_data
+}
+\keyword{internal}
diff --git a/man/rolling_mean.Rd b/man/rolling_mean.Rd
new file mode 100644
index 00000000..f8cf5000
--- /dev/null
+++ b/man/rolling_mean.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{rolling_mean}
+\alias{rolling_mean}
+\title{get a rolling average for the named columns}
+\usage{
+rolling_mean(epi_data, width = 7L, cols_to_mean = NULL)
+}
+\arguments{
+\item{epi_data}{the dataset}
+
+\item{width}{the number of days (or examples, the sliding isn't time-aware) to use}
+
+\item{cols_to_mean}{the non-key columns to take the mean over. \code{NULL} means all}
+}
+\description{
+add column(s) that are the rolling means of the specified columns, as
+implemented by slider. Defaults to the previous 7 days.
+Currently only group_by's on the geo_value. Should probably extend to more
+keys if you have them
+}
diff --git a/man/rolling_sd.Rd b/man/rolling_sd.Rd
new file mode 100644
index 00000000..22f3ec0c
--- /dev/null
+++ b/man/rolling_sd.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{rolling_sd}
+\alias{rolling_sd}
+\title{get a rolling standard deviation for the named columns}
+\usage{
+rolling_sd(
+  epi_data,
+  sd_width = 28L,
+  mean_width = NULL,
+  cols_to_sd = NULL,
+  keep_mean = FALSE
+)
+}
+\arguments{
+\item{epi_data}{the dataset}
+
+\item{sd_width}{the number of days (or examples, the sliding isn't
+time-aware) to use for the standard deviation calculation}
+
+\item{mean_width}{like \code{sd_width}, but it governs the mean. Should be less
+than the \code{sd_width}, and if \code{NULL} (the default) it is half of \code{sd_width}
+(so 14 in the complete default case)}
+
+\item{cols_to_sd}{the non-key columns to take the sd over. \code{NULL} means all}
+
+\item{keep_mean}{bool, if \code{TRUE}, it retains the mean column}
+}
+\description{
+A rolling standard deviation, based off of a rolling mean. First it
+calculates a rolling mean with width \code{mean_width}, and then squares the
+difference between that and the actual value, averaged over \code{sd_width}.
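+In symbols, with \eqn{m_i} the trailing \code{mean_width}-day mean of a column
+\eqn{x} and \eqn{w} the \code{sd_width} (a restatement of the computation above):
+\deqn{sd_t = \sqrt{\frac{1}{w} \sum_{i = t - w + 1}^{t} (x_i - m_i)^2}}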
+}
diff --git a/man/perform_sanity_checks.Rd b/man/sanitize_args_predictors_trainer.Rd
similarity index 85%
rename from man/perform_sanity_checks.Rd
rename to man/sanitize_args_predictors_trainer.Rd
index 152b3bf1..b56706ee 100644
--- a/man/perform_sanity_checks.Rd
+++ b/man/sanitize_args_predictors_trainer.Rd
@@ -1,10 +1,16 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/data_validation.R
-\name{perform_sanity_checks}
-\alias{perform_sanity_checks}
+\name{sanitize_args_predictors_trainer}
+\alias{sanitize_args_predictors_trainer}
 \title{helper function for those writing forecasters}
 \usage{
-perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
+sanitize_args_predictors_trainer(
+  epi_data,
+  outcome,
+  predictors,
+  trainer,
+  args_list
+)
 }
 \arguments{
 \item{epi_data}{the actual data used}
diff --git a/man/scaled_pop.Rd b/man/scaled_pop.Rd
index 880819eb..5522c85c 100644
--- a/man/scaled_pop.Rd
+++ b/man/scaled_pop.Rd
@@ -63,5 +63,5 @@ arguments:
 }
 \seealso{
 some utilities for making forecasters: \link{format_storage},
-\link{perform_sanity_checks}
+\link{sanitize_args_predictors_trainer}
 }
diff --git a/man/smoothed_scaled.Rd b/man/smoothed_scaled.Rd
new file mode 100644
index 00000000..16a3d2c0
--- /dev/null
+++ b/man/smoothed_scaled.Rd
@@ -0,0 +1,82 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/forecaster_smoothed_scaled.R
+\name{smoothed_scaled}
+\alias{smoothed_scaled}
+\title{predict on smoothed data and the standard deviation}
+\usage{
+smoothed_scaled(
+  epi_data,
+  outcome,
+  extra_sources = "",
+  ahead = 1,
+  pop_scaling = TRUE,
+  trainer = parsnip::linear_reg(),
+  quantile_levels = covidhub_probs(),
+  smooth_width = 7,
+  smooth_cols = NULL,
+  sd_width = 28,
+  sd_mean_width = 14,
+  sd_cols = NULL,
+  ...
+)
+}
+\arguments{
+\item{epi_data}{the actual data used}
+
+\item{outcome}{the name of the target variable}
+
+\item{extra_sources}{the name of any extra columns to use. This list could be
+empty}
+
+\item{ahead}{(this is relative to the \code{as_of} field of the \code{epi_df}, which is
+likely \emph{not} the same as the \code{ahead} used by epipredict, which is relative
+to the max time value of the \code{epi_df}. how to handle this is a modelling
+question left up to each forecaster; see latency_adjusting.R for the
+existing examples)}
+
+\item{pop_scaling}{bool; if \code{TRUE}, assume all numeric columns are on the
+count scale and translate them to a rate scale for model fitting.
+Predictions will be translated back to count scale. Any
+\code{layer_residual_quantiles} (for non-\code{"quantile_reg"} \code{trainer}s) will be
+done on the rate scale. When specifying predictor lags, note that rate
+variables will use the same names as and overwrite the count variables.
+Rates here will be counts per 100k population, based on
+\code{epipredict::state_census}.}
+
+\item{trainer}{optional; parsnip model specification to use for the core
+fitting & prediction (the \code{spec} of the internal
+\code{\link[epipredict:epi_workflow]{epipredict::epi_workflow}}). Default is \code{parsnip::linear_reg()}.}
+
+\item{quantile_levels}{The quantile levels to predict. Defaults to those
+required by covidhub.}
+
+\item{smooth_width}{the number of days over which to do smoothing. If \code{NULL},
+then no smoothing is applied.}
+
+\item{smooth_cols}{the names of the columns to smooth. If \code{NULL} it smooths
+everything}
+
+\item{sd_width}{the number of days over which to take a moving average of the
+standard deviation. If \code{NULL}, the sd isn't included.}
+
+\item{sd_mean_width}{the window size of the mean used internally for the sd
+calculation.}
+
+\item{sd_cols}{the names of the columns to take the sd of. If \code{NULL} it
+includes the sd of everything}
+
+\item{...}{any additional arguments as used by \link{arx_args_list}}
+}
+\description{
+This is a variant of \code{scaled_pop}, which predicts on a smoothed version of
+the data. Even if the target is smoothed when used as a /predictor/, as a
+/target/ it still uses the raw value (this captures some of the noise). It
+also uses a rolling standard deviation as an auxiliary signal, with a window
+of width \code{sd_width}, which by default is 28 days.
+If you are using \code{sd_width}, you should restrict the lags on the \code{sd} to only
+include \code{0}, so set your lags to be e.g. \code{list(c(0,7,14), c(0))}.
+}
+\seealso{
+some utilities for making forecasters: \link{format_storage},
+\link{sanitize_args_predictors_trainer}
+}
diff --git a/man/update_predictors.Rd b/man/update_predictors.Rd
new file mode 100644
index 00000000..435e5aba
--- /dev/null
+++ b/man/update_predictors.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{update_predictors}
+\alias{update_predictors}
+\title{update the predictors to only contain the smoothed/sd versions of cols}
+\usage{
+update_predictors(epi_data, cols_modified, predictors)
+}
+\arguments{
+\item{epi_data}{the epi_df, only included to get the non-key column names}
+
+\item{cols_modified}{the list of columns which have been modified. If this is \code{NULL}, that means we were modifying every column.}
+
+\item{predictors}{the initial set of predictors; any unmodified are kept, any modified are replaced with the modified versions (e.g. "a" becoming "a_m17").}
+}
+\value{
+returns an updated list of predictors, with modified columns replaced and non-modified columns left intact.
+}
+\description{
+modifies the list of predictors so that any which have been modified have the
+modified versions included, and not the original. Should only be applied
+after both rolling_mean and rolling_sd.
+}
diff --git a/tests/testthat/test-forecaster-utils.R b/tests/testthat/test-forecaster-utils.R
index 863acc80..dfe5f4ed 100644
--- a/tests/testthat/test-forecaster-utils.R
+++ b/tests/testthat/test-forecaster-utils.R
@@ -1,10 +1,10 @@
-test_that("perform_sanity_checks", {
+test_that("sanitize_args_predictors_trainer", {
   epi_data <- epipredict::case_death_rate_subset
   # don't need to test validate_forecaster_inputs as that's inherited
   # testing args_list inheritance
   ex_args <- arx_args_list()
-  expect_error(perform_sanity_checks(epi_data, "case_rate", c("case_rate"), 5, ex_args))
-  argsPredictors <- perform_sanity_checks(
+  expect_error(sanitize_args_predictors_trainer(epi_data, "case_rate", c("case_rate"), 5, ex_args))
+  argsPredictors <- sanitize_args_predictors_trainer(
     epi_data, "case_rate", c("case_rate", ""), parsnip::linear_reg(), ex_args
   )
   args_list <- argsPredictors[[1]]
diff --git a/tests/testthat/test-forecasters-basics.R b/tests/testthat/test-forecasters-basics.R
index c9d7a929..b0cc1da4 100644
--- a/tests/testthat/test-forecasters-basics.R
+++ b/tests/testthat/test-forecasters-basics.R
@@ -2,8 +2,10 @@ library(dplyr)
 # TODO better way to do this than copypasta
 forecasters <- list(
   c("scaled_pop", scaled_pop),
-  c("flatline_fc", flatline_fc)
+  c("flatline_fc", flatline_fc),
+  c("smoothed_scaled", smoothed_scaled)
 )
 for (forecaster in forecasters) {
   test_that(paste(forecaster[[1]], "gets the date and columns right"), {
     jhu <- epipredict::case_death_rate_subset %>%
diff --git a/tests/testthat/test-forecasters-data.R b/tests/testthat/test-forecasters-data.R
index 365811dd..4fc65f5d 100644
--- a/tests/testthat/test-forecasters-data.R
+++ b/tests/testthat/test-forecasters-data.R
@@ -6,8 +6,10 @@ forecasters <- tibble::tribble(
   ~forecaster, ~extra_params, ~extra_params_names, ~fc_name,
   scaled_pop, list(1, TRUE), list("ahead", "pop_scaling"), "scaled_pop",
   scaled_pop, list(1, FALSE), list("ahead", "pop_scaling"), "scaled_pop",
-  flatline_fc, list(1), list("ahead"), "flatline_fc"
+  flatline_fc, list(1), list("ahead"), "flatline_fc",
+  smoothed_scaled, list(1, list(c(0, 7, 14), c(0))), list("ahead", "lags"), "smoothed_scaled"
 )
+expects_nonequal <- c("scaled_pop", "smoothed_scaled")
 synth_mean <- 25
 synth_sd <- 2
 tiny_sd <- 1.0e-5
@@ -49,7 +51,7 @@ different_constants <- epiprocess::as_epi_archive(rbind(
 ))
 for (ii in 1:nrow(forecasters)) {
   test_that(paste(forecasters$fc_name[[ii]], " predicts a constant median for constant data"), {
-    if (forecasters$fc_name[[ii]] == "scaled_pop") {
+    if (forecasters$fc_name[[ii]] %in% expects_nonequal) {
       suppressWarnings(expect_warning(res <- get_pred(different_constants, ii), regexp = "prediction from rank-deficient fit"))
     } else {
       res <- get_pred(different_constants, ii)
@@ -126,7 +128,7 @@ missing_state <- epiprocess::as_epi_archive(rbind(
 ))
 for (ii in seq_len(nrow(forecasters))) {
   test_that(paste(forecasters$fc_name[[ii]], "predicts well in the presence of only one state with variably delayed data"), {
-    if (forecasters$fc_name[[ii]] == "scaled_pop") {
+    if (forecasters$fc_name[[ii]] %in% expects_nonequal) {
       suppressWarnings(expect_warning(res <- get_pred(missing_state, ii), regexp = "prediction from rank-deficient fit"))
     } else {
       res <- get_pred(missing_state, ii)
@@ -171,7 +173,7 @@ for (ii in seq_len(nrow(forecasters))) {
   test_that(paste(forecasters$fc_name[[ii]], "predicts a linear increasing slope correctly"), {
     # flatline will definitely fail this, so it's exempt
     if (!identical(forecasters$forecaster[[ii]], flatline_fc)) {
-      if (forecasters$fc_name[[ii]] == "scaled_pop") {
+      if (forecasters$fc_name[[ii]] %in% expects_nonequal) {
         suppressWarnings(expect_warning(res <- get_pred(linear, ii), regexp = "prediction from rank-deficient fit"))
       } else {
         res <- get_pred(linear, ii)
diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R
new file mode 100644
index 00000000..fdea75b2
--- /dev/null
+++ b/tests/testthat/test-transforms.R
@@ -0,0 +1,100 @@
+library(dplyr)
+n_days <- 40
+removed_date <- 10
+simple_dates <- seq(as.Date("2012-01-01"), by = "day", length.out = n_days)
+simple_dates <- simple_dates[-removed_date]
+rand_vals <- rnorm(n_days - 1)
+
+# Two states, with 2 variables. a is linear, going up in one state and down in the other
+# b is just random
+# note that day 10 is missing
+epi_data <- epiprocess::as_epi_df(rbind(tibble(
+  geo_value = "al",
+  time_value = simple_dates,
+  a = 1:(n_days - 1),
+  b = rand_vals
+), tibble(
+  geo_value = "ca",
+  time_value = simple_dates,
+  a = (n_days - 1):1,
+  b = rand_vals + 10
+)))
+
+test_that("rolling_mean generates correct mean", {
+  rolled <- rolling_mean(epi_data)
+  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m7", "b_m7"))
+  # hand specified rolling mean with a rear window of 7, noting that mean(1:7) = 4
+  linear_roll_mean <- c(seq(from = 1, by = .5, length.out = 7), seq(from = 5, to = 36, by = 1))
+  # day 10 is missing, so the averages for days 11-16 are thrown off, using only 6 values instead of 7
+  gap_starts <- epi_data %>% filter(geo_value == "al" & time_value == as.Date("2012-01-11")) %>% pull(a)
+  unusual_days <- map_vec(seq(from = 0, to = 5), \(d) mean(((gap_starts + d) - 0):((gap_starts + d) - 5)))
+  # stitching the lag induced hiccup into the "normal" mean values
+  expected_mean <- c(linear_roll_mean[1:9], unusual_days, linear_roll_mean[16:(n_days - 1)])
+  expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m7"), expected_mean)
+  # Doing the same for California
+  # same, but "ca" is reversed, noting mean(40:(40 - 7)) = 36.5
+  linear_reverse_roll_mean <- c(seq(from = 39, by = -0.5, length.out = 7), seq(from = 35, to = 4, by = -1))
+  # day 10 is missing, so days 11-16 are thrown off
+  gap_starts <- epi_data %>% filter(geo_value == "ca" & time_value == as.Date("2012-01-11")) %>% pull(a)
+  unusual_days <- map_vec(seq(from = 0, to = 5), \(d) mean(((gap_starts - d) + 0):((gap_starts - d) + 5)))
+  # stitching the lag induced hiccup into the "normal" mean values
+  expected_mean <- c(linear_reverse_roll_mean[1:9], unusual_days, linear_reverse_roll_mean[16:(n_days - 1)])
+  # actually testing
+  expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), expected_mean)
+
+  # making sure the type is maintained
+  expect_true("epi_df" %in% class(rolled))
+})
+
+test_that("rolling_sd generates correct standard deviation", {
+  rolled <- rolling_sd(epi_data, keep_mean = TRUE)
+  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_sd28", "b_m14", "b_sd28"))
+  # hand specified rolling mean with a rear window of 14, noting that mean(1:14) = 7.5
+  linear_roll_mean <- c(seq(from = 1, to = 7, by = .5), seq(from = 8, to = 16, by = 1), seq(from = 16.5, to = 32.5, by = 1))
+  expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m14"), linear_roll_mean)
+  # and the standard deviation is
+  linear_roll_mean <- append(linear_roll_mean, NA, after = removed_date - 1)
+  linear_values <- 1:39
+  linear_values <- append(linear_values, NA, after = removed_date - 1)
+  linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28 - 1))
+  # drop the extra date caused by the inclusion of the NAs
+  linear_roll_sd <- linear_roll_sd[-(removed_date)]
+  expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_sd28"), linear_roll_sd)
+  # even though ca is reversed, the changes are all the same, so the standard deviation is *exactly* the same values
+  expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_sd28"), linear_roll_sd)
+  # doesn't break types
+  expect_true("epi_df" %in% class(rolled))
+})
+
+test_that("get_trainable_names pulls out mean and sd columns", {
+  rolled <- rolling_sd(epi_data, keep_mean = TRUE)
+  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_sd28", "b_m14", "b_sd28"))
+  expect_equal(get_trainable_names(rolled, NULL), c("a", "b"))
+})
+# TODO example with NA's, example with missing days, only one column, keep_mean
+
+test_that("update_predictors keeps unmodified predictors", {
+  epi_data["c"] <- NaN
+  epi_data["d"] <- NaN
+  epi_data["b_m14"] <- NaN
+  epi_data["b_sd28"] <- NaN
+  predictors <- c("a", "b", "c") # everything but d
+  modified <- c("b", "c") # we want to exclude b but not its modified versions
+  expected_predictors <- c("a", "b_m14", "b_sd28")
+  expect_equal(update_predictors(epi_data, modified, predictors), expected_predictors)
+  expected_if_all_modified <- c("b_m14", "b_sd28")
+  expect_equal(update_predictors(epi_data, NULL, predictors), expected_if_all_modified)
+})
+
+test_that("rolling_sd doesn't keep the mean columns by default", {
+  rolled <- rolling_sd(epi_data)
+  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_sd28", "b_sd28"))
+})
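+
+# a minimal end-to-end sketch of how these transforms compose inside
+# `smoothed_scaled`: smooth, then swap the predictor list over to the derived
+# columns (the expected names follow the `_m{width}` schema used above)
+test_that("rolling_mean and update_predictors compose", {
+  smoothed <- rolling_mean(epi_data, width = 7L)
+  new_predictors <- update_predictors(smoothed, NULL, c("a", "b"))
+  expect_equal(new_predictors, c("a_m7", "b_m7"))
+})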