From e4b3d4515c7bd32366aafd8fcfaeafac8fd7e0a3 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 8 Dec 2023 16:44:12 -0800 Subject: [PATCH 01/20] feat: rolling_mean/sd for a new forecaster --- DESCRIPTION | 1 + NAMESPACE | 5 +++ R/data_transforms.R | 69 ++++++++++++++++++++++++++++++++ man/get_trainable_names.Rd | 17 ++++++++ man/rolling_mean.Rd | 21 ++++++++++ man/rolling_sd.Rd | 33 +++++++++++++++ man/smooth_scaled.Rd | 65 ++++++++++++++++++++++++++++++ tests/testthat/test-transforms.R | 37 +++++++++++++++++ 8 files changed, 248 insertions(+) create mode 100644 R/data_transforms.R create mode 100644 man/get_trainable_names.Rd create mode 100644 man/rolling_mean.Rd create mode 100644 man/rolling_sd.Rd create mode 100644 man/smooth_scaled.Rd create mode 100644 tests/testthat/test-transforms.R diff --git a/DESCRIPTION b/DESCRIPTION index ad32b71e..21fbcc91 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,6 +27,7 @@ Imports: purrr, recipes (>= 1.0.4), rlang, + slider, targets, tibble, tidyr diff --git a/NAMESPACE b/NAMESPACE index 72811f7d..849bac7c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,12 +30,15 @@ export(make_target_param_grid) export(overprediction) export(perform_sanity_checks) export(read_external_predictions_data) +export(rolling_mean) +export(rolling_sd) export(run_evaluation_measure) export(run_workflow_and_format) export(scaled_pop) export(sharpness) export(single_id) export(slide_forecaster) +export(smooth_scaled) export(underprediction) export(weighted_interval_score) importFrom(assertthat,assert_that) @@ -96,6 +99,8 @@ importFrom(rlang,.data) importFrom(rlang,quo) importFrom(rlang,sym) importFrom(rlang,syms) +importFrom(slider,slide2_dbl) +importFrom(slider,slide_dbl) importFrom(targets,tar_config_get) importFrom(targets,tar_group) importFrom(targets,tar_read) diff --git a/R/data_transforms.R b/R/data_transforms.R new file mode 100644 index 00000000..99da29a6 --- /dev/null +++ b/R/data_transforms.R @@ -0,0 +1,69 @@ +# various reusable transforms to apply before handing to epipredict + +#' extract the non-key columns from epi_data +#' @keywords internal +#' @param epi_data the epi_data tibble +#' @param cols vector of column names to use. If `NULL`, fill with all non-key columns +get_trainable_names <- function(epi_data, cols) { + if (is.null(cols)) { + cols <- names(epi_data) + cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))] + } + return(cols) +} + +#' get a rolling average for the named columns +#' @description +#' add column(s) that are the rolling means of the specified columns, as +#' implemented by slider. Defaults to the previous 7 days. +#' Currently only group_by's on the geo_value. Should probably extend to more +#' keys if you have them +#' @param epi_data the dataset +#' @param width the number of days (or examples, the sliding isn't time-aware) to use +#' @param cols_to_mean the non-key columns to take the mean over. `NULL` means all +#' @importFrom slider slide_dbl +#' @export +rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) { + cols_to_mean <- get_trainable_names(epi_data, cols_to_mean) + epi_data %<>% group_by(geo_value) + for (col in cols_to_mean) { + mean_name <- paste0(col, width) + epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = width)) + } + epi_data %<>% ungroup() + return(epi_data) +} + +#' get a rolling standard deviation for the named columns +#' @description +#' A rolling standard deviation, based off of a rolling mean. 
First it
+#' calculates a rolling mean with width `mean_width`, and then takes the root
+#' mean square of the difference between that and the actual value over `sd_width`.
+#' @param epi_data the dataset
+#' @param sd_width the number of days (or examples, the sliding isn't
+#' time-aware) to use for the standard deviation calculation
+#' @param mean_width like `sd_width`, but it governs the mean. Should be less
+#' than the `sd_width`, and if `NULL` (the default) it is half of `sd_width`
+#' (so 14 in the complete default case)
+#' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
+#' @param keep_mean bool, if `TRUE`, it retains the mean column
+#' @importFrom slider slide_dbl slide2_dbl
+#' @export
+rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
+  if (is.null(mean_width)) {
+    mean_width <- as.integer(ceiling(sd_width / 2))
+  }
+  cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
+  epi_data %<>% group_by(geo_value)
+  for (col in cols_to_sd) {
+    mean_name <- paste0(col, "_m", mean_width)
+    sd_name <- paste0(col, "_SD", sd_width)
+    epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width))
+    epi_data %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width))
+    if (!keep_mean) {
+      epi_data %<>% select(-{{ mean_name }})
+    }
+  }
+  epi_data %<>% ungroup()
+  return(epi_data)
+}
diff --git a/man/get_trainable_names.Rd b/man/get_trainable_names.Rd
new file mode 100644
index 00000000..42650dbe
--- /dev/null
+++ b/man/get_trainable_names.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{get_trainable_names}
+\alias{get_trainable_names}
+\title{extract the non-key columns from epi_data}
+\usage{
+get_trainable_names(epi_data, cols)
+}
+\arguments{
+\item{epi_data}{the epi_data tibble}
+
+\item{cols}{vector of column names to use. If \code{NULL}, fill with all non-key columns}
+}
+\description{
+extract the non-key columns from epi_data
+}
+\keyword{internal}
diff --git a/man/rolling_mean.Rd b/man/rolling_mean.Rd
new file mode 100644
index 00000000..f8cf5000
--- /dev/null
+++ b/man/rolling_mean.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{rolling_mean}
+\alias{rolling_mean}
+\title{get a rolling average for the named columns}
+\usage{
+rolling_mean(epi_data, width = 7L, cols_to_mean = NULL)
+}
+\arguments{
+\item{epi_data}{the dataset}
+
+\item{width}{the number of days (or examples, the sliding isn't time-aware) to use}
+
+\item{cols_to_mean}{the non-key columns to take the mean over. \code{NULL} means all}
+}
+\description{
+add column(s) that are the rolling means of the specified columns, as
+implemented by slider. Defaults to the previous 7 days.
+Currently only group_by's on the geo_value. 
Should probably extend to more
+keys if you have them
+}
diff --git a/man/rolling_sd.Rd b/man/rolling_sd.Rd
new file mode 100644
index 00000000..22f3ec0c
--- /dev/null
+++ b/man/rolling_sd.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{rolling_sd}
+\alias{rolling_sd}
+\title{get a rolling standard deviation for the named columns}
+\usage{
+rolling_sd(
+  epi_data,
+  sd_width = 28L,
+  mean_width = NULL,
+  cols_to_sd = NULL,
+  keep_mean = FALSE
+)
+}
+\arguments{
+\item{epi_data}{the dataset}
+
+\item{sd_width}{the number of days (or examples, the sliding isn't
+time-aware) to use for the standard deviation calculation}
+
+\item{mean_width}{like \code{sd_width}, but it governs the mean. Should be less
+than the \code{sd_width}, and if \code{NULL} (the default) it is half of \code{sd_width}
+(so 14 in the complete default case)}
+
+\item{cols_to_sd}{the non-key columns to take the sd over. \code{NULL} means all}
+
+\item{keep_mean}{bool, if \code{TRUE}, it retains the mean column}
+}
+\description{
+A rolling standard deviation, based off of a rolling mean. First it
+calculates a rolling mean with width \code{mean_width}, and then takes the root
+mean square of the difference between that and the actual value over \code{sd_width}.
+}
diff --git a/man/smooth_scaled.Rd b/man/smooth_scaled.Rd
new file mode 100644
index 00000000..65698932
--- /dev/null
+++ b/man/smooth_scaled.Rd
@@ -0,0 +1,65 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/forecaster_smoothed_scaled.R
+\name{smooth_scaled}
+\alias{smooth_scaled}
+\title{predict on smoothed data and the standard deviation}
+\usage{
+smooth_scaled(
+  epi_data,
+  outcome,
+  extra_sources = "",
+  ahead = 1,
+  pop_scaling = TRUE,
+  trainer = parsnip::linear_reg(),
+  quantile_levels = covidhub_probs(),
+  smooth_days = 7,
+  smooth_cols = NULL,
+  sd_days = 28,
+  sd_cols = NULL,
+  ...
+)
+}
+\arguments{
+\item{epi_data}{the actual data used}
+
+\item{outcome}{the name of the target variable}
+
+\item{extra_sources}{the name of any extra columns to use. This list could be
+empty}
+
+\item{ahead}{(this is relative to the \code{as_of} field of the \code{epi_df}, which is
+likely \emph{not} the same as the \code{ahead} used by epipredict, which is relative
+to the max time value of the \code{epi_df}. how to handle this is a modelling
+question left up to each forecaster; see latency_adjusting.R for the
+existing examples)}
+
+\item{pop_scaling}{an example extra parameter unique to this forecaster}
+
+\item{trainer}{an example extra parameter that is fairly common}
+
+\item{quantile_levels}{The quantile levels to predict. Defaults to those
+required by covidhub.}
+
+\item{smooth_days}{the number of days over which to do smoothing. If \code{NULL},
+then no smoothing is applied.}
+
+\item{smooth_cols}{the names of the columns to smooth. If \code{NULL} it smooths
+everything}
+
+\item{sd_cols}{the names of the columns to smooth. If \code{NULL} its includes
+the sd ofeverything}
+
+\item{sd_width}{the number of days over which to take a moving average of the
+standard deviation. If \code{NULL}, the sd_width isn't included.}
+}
+\description{
+This is a variant of \code{scaled_pop}, which predicts on a smoothed version of
+the data. Even if the target is smoothed when used as a /predictor/, as a
+/target/ it still uses the raw value (this captures some of the noise). 
It
+also uses a rolling standard deviation as an auxiliary signal, with a
+window of width \code{sd_width}, which by default is 28 days.
+}
+\seealso{
+some utilities for making forecasters: \link{format_storage},
+\link{perform_sanity_checks}
+}
diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R
new file mode 100644
index 00000000..ea762391
--- /dev/null
+++ b/tests/testthat/test-transforms.R
@@ -0,0 +1,37 @@
+n_days <- 40
+simple_dates <- seq(as.Date("2012-01-01"), by = "day", length.out = n_days)
+rand_vals <- rnorm(n_days)
+epi_data <- epiprocess::as_epi_df(rbind(tibble(
+geo_value = "al",
+time_value = simple_dates,
+a = 1:n_days,
+b = rand_vals
+), tibble(
+geo_value = "ca",
+time_value = simple_dates,
+a = n_days:1,
+b = rand_vals + 10
+)))
+test_that("rolling_mean generates correct mean", {
+  rolled <- rolling_mean(epi_data)
+  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a7", "b7"))
+  # hand specified rolling mean with a rear window of 7, noting that mean(1:7) = 4
+  linear_roll_mean <- c(seq(from=1, to = 4, by = .5), seq(from = 4.5, to = 36.5, by = 1))
+  expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a7"), linear_roll_mean)
+  # same, but "ca" is reversed, noting mean(40:(40-7)) =36.5
+  linear_reverse_roll_mean <- c(seq(from=40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1))
+  expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a7"), linear_reverse_roll_mean)
+})
+
+test_that("rolling_sd generates correct standard deviation", {
+  rolled <- rolling_sd(epi_data)
+  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_SD28", "b_SD28"))
+  # hand specified rolling mean with a rear window of 7, noting that mean(1:14) = 7.5
+  linear_roll_mean <- c(seq(from=1, to = 7.5, by = .5), seq(from = 8, to = 33, by = 1))
+  # and the standard deviation is
+  linear_roll_sd <- sqrt(slider::slide_dbl((1:40 - linear_roll_mean)^2, mean, .before = 28))
+  expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_SD28"), linear_roll_sd)
+  # even though ca is reversed, the changes are all the same, so the standard deviation is *exactly* the same values
+  expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_SD28"), linear_roll_sd)
+  })
+# TODO example with NA's, example with missing days, only one column, keep_mean

From 4a788103a7a7dcf4ad949c767e931e546d8edf99 Mon Sep 17 00:00:00 2001
From: dsweber2
Date: Mon, 11 Dec 2023 15:27:36 -0800
Subject: [PATCH 02/20] consistent name, only smooth non-smoothed, init forecaster

---
 NAMESPACE                                    |   2 +-
 R/data_transforms.R                          |   6 +-
 R/forecaster_smoothed_scaled.R               | 138 +++++++++++++++++++
 man/get_trainable_names.Rd                   |   4 +-
 man/{smooth_scaled.Rd => smoothed_scaled.Rd} |  22 +--
 tests/testthat/test-forecasters-basics.R     |   3 +-
 tests/testthat/test-transforms.R             |  35 +++--
 7 files changed, 180 insertions(+), 30 deletions(-)
 create mode 100644 R/forecaster_smoothed_scaled.R
 rename man/{smooth_scaled.Rd => smoothed_scaled.Rd} (86%)

diff --git a/NAMESPACE b/NAMESPACE
index 849bac7c..f65be2b1 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -38,7 +38,7 @@ export(scaled_pop)
 export(sharpness)
 export(single_id)
 export(slide_forecaster)
-export(smooth_scaled)
+export(smoothed_scaled)
 export(underprediction)
 export(weighted_interval_score)
 importFrom(assertthat,assert_that)
diff --git a/R/data_transforms.R b/R/data_transforms.R
index 99da29a6..43164f4a 100644
--- a/R/data_transforms.R
+++ b/R/data_transforms.R
@@ -1,6 +1,6 @@
 # various reusable transforms to apply before handing to epipredict
 
-#' extract the non-key columns from epi_data
+#' extract the non-key, non-smoothed columns from epi_data
 #' @keywords internal
 #' @param epi_data the epi_data tibble
 #' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
@@ -8,6 +8,8 @@ get_trainable_names <- function(epi_data, cols) {
   if (is.null(cols)) {
     cols <- names(epi_data)
     cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))]
+    # exclude anything with the same naming schema as the rolling average/sd created below
+    cols <- cols[!grepl("_\\w{1,2}\\d+", cols)]
   }
   return(cols)
 }
@@ -27,7 +29,7 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
   cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
   epi_data %<>% group_by(geo_value)
   for (col in cols_to_mean) {
-    mean_name <- paste0(col, width)
+    mean_name <- paste0(col, "_m", width)
     epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = width))
   }
   epi_data %<>% ungroup()
diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R
new file mode 100644
index 00000000..0edbaa78
--- /dev/null
+++ b/R/forecaster_smoothed_scaled.R
@@ -0,0 +1,138 @@
+#' predict on smoothed data and the standard deviation
+#' @description
+#' This is a variant of `scaled_pop`, which predicts on a smoothed version of
+#' the data. Even if the target is smoothed when used as a /predictor/, as a
+#' /target/ it still uses the raw value (this captures some of the noise). It
+#' also uses a rolling standard deviation as an auxiliary signal, with a
+#' window of width `sd_width`, which by default is 28 days.
+#' @param epi_data the actual data used
+#' @param outcome the name of the target variable
+#' @param extra_sources the name of any extra columns to use. This list could be
+#' empty
+#' @param ahead (this is relative to the `as_of` field of the `epi_df`, which is
+#' likely *not* the same as the `ahead` used by epipredict, which is relative
+#' to the max time value of the `epi_df`. how to handle this is a modelling
+#' question left up to each forecaster; see latency_adjusting.R for the
+#' existing examples)
+#' @param pop_scaling an example extra parameter unique to this forecaster
+#' @param trainer an example extra parameter that is fairly common
+#' @param smooth_width the number of days over which to do smoothing. If `NULL`,
+#' then no smoothing is applied.
+#' @param smooth_cols the names of the columns to smooth. If `NULL` it smooths
+#' everything
+#' @param sd_width the number of days over which to take a moving average of the
+#' standard deviation. If `NULL`, the sd_width isn't included.
+#' @param sd_mean_width to calculate the sd, we need a window size for the mean
+#' used.
+#' @param sd_cols the names of the columns to smooth. If `NULL` its includes
+#' the sd of everything
+#' @param quantile_levels The quantile levels to predict. Defaults to those
+#' required by covidhub.
+#' @seealso some utilities for making forecasters: [format_storage],
+#' [perform_sanity_checks]
+#' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling
+#' @importFrom tibble tibble
+#' @importFrom recipes all_numeric
+#' @export
+smoothed_scaled <- function(epi_data,
+                            outcome,
+                            extra_sources = "",
+                            ahead = 1,
+                            pop_scaling = TRUE,
+                            trainer = parsnip::linear_reg(),
+                            quantile_levels = covidhub_probs(),
+                            smooth_width = 7,
+                            smooth_cols = NULL,
+                            sd_width = 28,
+                            sd_mean_width = 14,
+                            sd_cols = NULL,
+                            ...) 
{ + # perform any preprocessing not supported by epipredict + # this is a temp fix until a real fix gets put into epipredict + epi_data <- clear_lastminute_nas(epi_data) + # one that every forecaster will need to handle: how to manage max(time_value) + # that's older than the `as_of` date + epidataAhead <- extend_ahead(epi_data, ahead) + # see latency_adjusting for other examples + # this next part is basically unavoidable boilerplate you'll want to copy + epi_data <- epidataAhead[[1]] + effective_ahead <- epidataAhead[[2]] + args_input <- list(...) + # edge case where there is no data or less data than the lags; eventually epipredict will handle this + if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) { + null_result <- tibble( + geo_value = character(), + forecast_date = lubridate::Date(), + target_end_date = lubridate::Date(), + quantile = numeric(), + value = numeric() + ) + return(null_result) + } + args_input[["ahead"]] <- effective_ahead + args_input[["quantile_levels"]] <- quantile_levels + args_list <- do.call(arx_args_list, args_input) + # if you want to ignore extra_sources, setting predictors is the way to do it + predictors <- c(outcome, extra_sources) + # TODO: Partial match quantile_level coming from here (on Dmitry's machine) + argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) + args_list <- argsPredictorsTrainer[[1]] + predictors <- argsPredictorsTrainer[[2]] + trainer <- argsPredictorsTrainer[[3]] + # end of the copypasta + # finally, any other pre-processing (e.g. smoothing) that isn't performed by + # epipredict + # smoothing + keep_mean <- (smooth_width == sd_mean_width) # do we need to do the mean separately? + if (!is.null(smooth_width) && !keep_mean) { + epi_data %<>% rolling_mean( + width = smooth_width, + cols_to_mean = smooth_cols + ) + } + + # measuring standard deviation + if (!is.null(sd_width)) { + epi_data %<>% rolling_sd( + sd_width = sd_width, + mean_width = sd_mean_width, + cols_to_sd = sd_cols, + keep_mean = keep_mean + ) + } + # even + + # preprocessing supported by epipredict + preproc <- epi_recipe(epi_data) + if (pop_scaling) { + preproc %<>% step_population_scaling( + all_numeric(), + df = epipredict::state_census, + df_pop_col = "pop", + create_new = FALSE, + rate_rescaling = 1e5, + by = c("geo_value" = "abbr") + ) + } + preproc %<>% arx_preprocess(outcome, predictors, args_list) + + # postprocessing supported by epipredict + postproc <- frosting() + postproc %<>% arx_postprocess(trainer, args_list) + if (pop_scaling) { + postproc %<>% layer_population_scaling( + .pred, .pred_distn, + df = epipredict::state_census, + df_pop_col = "pop", + create_new = FALSE, + rate_rescaling = 1e5, + by = c("geo_value" = "abbr") + ) + } + # with all the setup done, we execute and format + pred <- run_workflow_and_format(preproc, postproc, trainer, epi_data) + # now pred has the columns + # (geo_value, forecast_date, target_end_date, quantile, value) + # finally, any postprocessing not supported by epipredict e.g. 
calibration + return(pred) +} diff --git a/man/get_trainable_names.Rd b/man/get_trainable_names.Rd index 42650dbe..700c9216 100644 --- a/man/get_trainable_names.Rd +++ b/man/get_trainable_names.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/data_transforms.R \name{get_trainable_names} \alias{get_trainable_names} -\title{extract the non-key columns from epi_data} +\title{extract the non-key, non-smoothed columns from epi_data} \usage{ get_trainable_names(epi_data, cols) } @@ -12,6 +12,6 @@ get_trainable_names(epi_data, cols) \item{cols}{vector of column names to use. If \code{NULL}, fill with all non-key columns} } \description{ -extract the non-key columns from epi_data +extract the non-key, non-smoothed columns from epi_data } \keyword{internal} diff --git a/man/smooth_scaled.Rd b/man/smoothed_scaled.Rd similarity index 86% rename from man/smooth_scaled.Rd rename to man/smoothed_scaled.Rd index 65698932..4a079fa4 100644 --- a/man/smooth_scaled.Rd +++ b/man/smoothed_scaled.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/forecaster_smoothed_scaled.R -\name{smooth_scaled} -\alias{smooth_scaled} +\name{smoothed_scaled} +\alias{smoothed_scaled} \title{predict on smoothed data and the standard deviation} \usage{ -smooth_scaled( +smoothed_scaled( epi_data, outcome, extra_sources = "", @@ -12,9 +12,10 @@ smooth_scaled( pop_scaling = TRUE, trainer = parsnip::linear_reg(), quantile_levels = covidhub_probs(), - smooth_days = 7, + smooth_width = 7, smooth_cols = NULL, - sd_days = 28, + sd_width = 28, + sd_mean_width = 14, sd_cols = NULL, ... ) @@ -40,17 +41,20 @@ existing examples)} \item{quantile_levels}{The quantile levels to predict. Defaults to those required by covidhub.} -\item{smooth_days}{the number of days over which to do smoothing. If \code{NULL}, +\item{smooth_width}{the number of days over which to do smoothing. If \code{NULL}, then no smoothing is applied.} \item{smooth_cols}{the names of the columns to smooth. If \code{NULL} it smooths everything} -\item{sd_cols}{the names of the columns to smooth. If \code{NULL} its includes -the sd ofeverything} - \item{sd_width}{the number of days over which to take a moving average of the standard deviation. If \code{NULL}, the sd_width isn't included.} + +\item{sd_mean_width}{to calculate the sd, we need a window size for the mean +used.} + +\item{sd_cols}{the names of the columns to smooth. 
If \code{NULL} its includes +the sd of everything} } \description{ This is a variant of \code{scaled_pop}, which predicts on a smoothed version of diff --git a/tests/testthat/test-forecasters-basics.R b/tests/testthat/test-forecasters-basics.R index c9d7a929..0c61a8d8 100644 --- a/tests/testthat/test-forecasters-basics.R +++ b/tests/testthat/test-forecasters-basics.R @@ -2,7 +2,8 @@ library(dplyr) # TODO better way to do this than copypasta forecasters <- list( c("scaled_pop", scaled_pop), - c("flatline_fc", flatline_fc) + c("flatline_fc", flatline_fc), + c("smoothed_scaled", smoothed_scaled) ) for (forecaster in forecasters) { test_that(paste(forecaster[[1]], "gets the date and columns right"), { diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R index ea762391..3db526b4 100644 --- a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -2,36 +2,41 @@ n_days <- 40 simple_dates <- seq(as.Date("2012-01-01"), by = "day", length.out = n_days) rand_vals <- rnorm(n_days) epi_data <- epiprocess::as_epi_df(rbind(tibble( -geo_value = "al", -time_value = simple_dates, -a = 1:n_days, -b = rand_vals + geo_value = "al", + time_value = simple_dates, + a = 1:n_days, + b = rand_vals ), tibble( -geo_value = "ca", -time_value = simple_dates, -a = n_days:1, -b = rand_vals + 10 + geo_value = "ca", + time_value = simple_dates, + a = n_days:1, + b = rand_vals + 10 ))) test_that("rolling_mean generates correct mean", { rolled <- rolling_mean(epi_data) - expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a7", "b7")) + expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m7", "b_m7")) # hand specified rolling mean with a rear window of 7, noting that mean(1:7) = 4 - linear_roll_mean <- c(seq(from=1, to = 4, by = .5), seq(from = 4.5, to = 36.5, by = 1)) - expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a7"), linear_roll_mean) + linear_roll_mean <- c(seq(from = 1, to = 4, by = .5), seq(from = 4.5, to = 36.5, by = 1)) + expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m7"), linear_roll_mean) # same, but "ca" is reversed, noting mean(40:(40-7)) =36.5 - linear_reverse_roll_mean <- c(seq(from=40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1)) - expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a7"), linear_reverse_roll_mean) + linear_reverse_roll_mean <- c(seq(from = 40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1)) + expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), linear_reverse_roll_mean) }) test_that("rolling_sd generates correct standard deviation", { rolled <- rolling_sd(epi_data) expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_SD28", "b_SD28")) # hand specified rolling mean with a rear window of 7, noting that mean(1:14) = 7.5 - linear_roll_mean <- c(seq(from=1, to = 7.5, by = .5), seq(from = 8, to = 33, by = 1)) + linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), seq(from = 8, to = 33, by = 1)) # and the standard deviation is linear_roll_sd <- sqrt(slider::slide_dbl((1:40 - linear_roll_mean)^2, mean, .before = 28)) expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_SD28"), linear_roll_sd) # even though ca is reversed, the changes are all the same, so the standard deviation is *exactly* the same values expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_SD28"), linear_roll_sd) - }) +}) +testthat("get_trainable_names pulls out mean and sd columns", { + rolled <- rolling_sd(epi_data, keep_mean = TRUE) + 
expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_SD28", "b_m14", "b_SD28")) + expect_equal(get_trainable_names(rolled, NULL), c("a", "b")) +}) # TODO example with NA's, example with missing days, only one column, keep_mean From 1edc6f6d5411ea370a341c40ede5f6f1eaeed212 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 13 Dec 2023 14:57:33 -0800 Subject: [PATCH 03/20] smoothed_scaled passes all forecaster tests --- NAMESPACE | 4 ++ R/data_transforms.R | 60 +++++++++++++++++++++++- R/epipredict_utilities.R | 8 ++-- R/forecaster_smoothed_scaled.R | 4 +- man/arx_preprocess.Rd | 6 +-- man/cache_metadata.Rd | 11 +++++ man/get_nonkey_names.Rd | 14 ++++++ man/update_predictors.Rd | 16 +++++++ tests/testthat/test-forecasters-basics.R | 1 + tests/testthat/test-transforms.R | 19 +++++++- 10 files changed, 131 insertions(+), 12 deletions(-) create mode 100644 man/cache_metadata.Rd create mode 100644 man/get_nonkey_names.Rd create mode 100644 man/update_predictors.Rd diff --git a/NAMESPACE b/NAMESPACE index f65be2b1..8b1721f6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,6 +40,7 @@ export(single_id) export(slide_forecaster) export(smoothed_scaled) export(underprediction) +export(update_predictors) export(weighted_interval_score) importFrom(assertthat,assert_that) importFrom(cli,cli_abort) @@ -88,9 +89,12 @@ importFrom(epiprocess,epix_slide) importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(purrr,imap) +importFrom(purrr,list_modify) importFrom(purrr,map) importFrom(purrr,map2_vec) +importFrom(purrr,map_chr) importFrom(purrr,map_vec) +importFrom(purrr,reduce) importFrom(purrr,transpose) importFrom(recipes,all_numeric) importFrom(rlang,"!!") diff --git a/R/data_transforms.R b/R/data_transforms.R index 43164f4a..04a71494 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -6,14 +6,48 @@ #' @param cols vector of column names to use. 
If `NULL`, fill with all non-key columns get_trainable_names <- function(epi_data, cols) { if (is.null(cols)) { - cols <- names(epi_data) - cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))] + cols <- get_nonkey_names(epi_data) # exclude anything with the same naming schema as the rolling average/sd created below cols <- cols[!grepl("_\\w{1,2}\\d+", cols)] } return(cols) } +#' just the names which aren't keys for an epi_df +#' @description +#' names, but it excludes keys +#' @param epi_data the epi_df +get_nonkey_names <- function(epi_data) { + cols <- names(epi_data) + cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))] +} + + +#' update the predictors to only contain the smoothed/sd versions of cols +#' @description +#' should only be applied after both rolling_mean and rolling_sd +#' @param epi_data the epi_df +#' @param cols the list of columns +#' @importFrom purrr map map_chr reduce +#' @export +update_predictors <- function(epi_data, cols_modified, predictors) { + if (!is.null(cols_modified)) { + # if cols_modified isn't null, make sure we include predictors that weren't modified + other_predictors <- map(cols_modified, ~ !grepl(.x, predictors)) %>% reduce(`&`) + other_predictors <- predictors[other_predictors] + } else { + other_predictors <- c() + } + # all the non-key names + col_names <- get_nonkey_names(epi_data) + is_present <- function(x) { + grepl(x, col_names) & !(col_names %in% predictors) + } + is_modified <- map(predictors, is_present) %>% reduce(`|`) + new_predictors <- col_names[is_modified] + return(c(other_predictors, new_predictors)) +} + #' get a rolling average for the named columns #' @description #' add column(s) that are the rolling means of the specified columns, as @@ -36,6 +70,25 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) { return(epi_data) } +#' store the metadata in a easy to reapply way +#' @importFrom purrr list_modify +cache_metadata <- function(epi_data) { + features <- list() + all_others <- attributes(epi_data)$metadata + all_others["geo_type"] <- NULL + all_others["time_type"] <- NULL + all_others["as_of"] <- NULL + if (length(all_others) == 0) { + all_others <- list() + } + features <- list( + as_of = attributes(epi_data)$metadata$as_of, + geo_type = attributes(epi_data)$metadata$geo_type, + time_type = attributes(epi_data)$metadata$time_type, all_others = all_others + ) + return(features) +} + #' get a rolling standard deviation for the named columns #' @description #' A rolling standard deviation, based off of a rolling mean. 
First it
@@ -56,6 +56,7 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd =
     mean_width <- as.integer(ceiling(sd_width / 2))
   }
   cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
+  metadata <- cache_metadata(epi_data)
   epi_data %<>% group_by(geo_value)
   for (col in cols_to_sd) {
     mean_name <- paste0(col, "_m", mean_width)
@@ -63,8 +64,10 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd =
     epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width))
     epi_data %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width))
     if (!keep_mean) {
+      # TODO make sure the extra info sticks around
       epi_data %<>% select(-{{ mean_name }})
     }
+    epi_data %<>% as_epi_df(metadata$geo_type, metadata$time_type, metadata$as_of, metadata$all_others)
   }
   epi_data %<>% ungroup()
   return(epi_data)
diff --git a/R/epipredict_utilities.R b/R/epipredict_utilities.R
index fc56925f..9e826273 100644
--- a/R/epipredict_utilities.R
+++ b/R/epipredict_utilities.R
@@ -9,18 +9,18 @@
 #' @seealso [arx_postprocess] for the layer equivalent
 #' @importFrom epipredict step_epi_lag step_epi_ahead step_epi_naomit step_training_window
 #' @export
-arx_preprocess <- function(rec, outcome, predictors, args_list) {
+arx_preprocess <- function(preproc, outcome, predictors, args_list) {
   # input already validated
   lags <- args_list$lags
   for (l in seq_along(lags)) {
     p <- predictors[l]
-    rec %<>% step_epi_lag(!!p, lag = lags[[l]])
+    preproc %<>% step_epi_lag(!!p, lag = lags[[l]])
   }
-  rec %<>%
+  preproc %<>%
     step_epi_ahead(!!outcome, ahead = args_list$ahead) %>%
     step_epi_naomit() %>%
     step_training_window(n_recent = args_list$n_training)
-  return(rec)
+  return(preproc)
 }
 
 # TODO replace with `layer_arx_forecaster`
diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R
index 0edbaa78..12e93315 100644
--- a/R/forecaster_smoothed_scaled.R
+++ b/R/forecaster_smoothed_scaled.R
@@ -100,8 +100,8 @@ smoothed_scaled <- function(epi_data,
       keep_mean = keep_mean
     )
   }
-  # even
-
+  # and need to make sure we exclude the original variables as predictors
+  predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors)
   # preprocessing supported by epipredict
   preproc <- epi_recipe(epi_data)
   if (pop_scaling) {
diff --git a/man/arx_preprocess.Rd b/man/arx_preprocess.Rd
index 35e2e0b0..1b24a41c 100644
--- a/man/arx_preprocess.Rd
+++ b/man/arx_preprocess.Rd
@@ -4,16 +4,16 @@
 \alias{arx_preprocess}
 \title{add the default steps for arx_forecaster}
 \usage{
-arx_preprocess(rec, outcome, predictors, args_list)
+arx_preprocess(preproc, outcome, predictors, args_list)
 }
 \arguments{
-\item{rec}{an \code{\link[epipredict:epi_recipe]{epipredict::epi_recipe}}}
-
 \item{outcome}{a character of the column to be predicted}
 
 \item{predictors}{a character vector of the columns used as predictors}
 
 \item{args_list}{an \code{\link[epipredict:arx_args_list]{epipredict::arx_args_list}}}
+
+\item{rec}{an \code{\link[epipredict:epi_recipe]{epipredict::epi_recipe}}}
 }
 \description{
 add the default steps for arx_forecaster
diff --git a/man/cache_metadata.Rd b/man/cache_metadata.Rd
new file mode 100644
index 00000000..14a6d64b
--- /dev/null
+++ b/man/cache_metadata.Rd
@@ -0,0 +1,11 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{cache_metadata}
+\alias{cache_metadata}
+\title{store the metadata in a easy to reapply way}
+\usage{
+cache_metadata(epi_data) +} +\description{ +store the metadata in a easy to reapply way +} diff --git a/man/get_nonkey_names.Rd b/man/get_nonkey_names.Rd new file mode 100644 index 00000000..242ebabd --- /dev/null +++ b/man/get_nonkey_names.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_transforms.R +\name{get_nonkey_names} +\alias{get_nonkey_names} +\title{just the names which aren't keys for an epi_df} +\usage{ +get_nonkey_names(epi_data) +} +\arguments{ +\item{epi_data}{the epi_df} +} +\description{ +names, but it excludes keys +} diff --git a/man/update_predictors.Rd b/man/update_predictors.Rd new file mode 100644 index 00000000..e7d55b21 --- /dev/null +++ b/man/update_predictors.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_transforms.R +\name{update_predictors} +\alias{update_predictors} +\title{update the predictors to only contain the smoothed/sd versions of cols} +\usage{ +update_predictors(epi_data, cols_modified, predictors) +} +\arguments{ +\item{epi_data}{the epi_df} + +\item{cols}{the list of columns} +} +\description{ +should only be applied after both rolling_mean and rolling_sd +} diff --git a/tests/testthat/test-forecasters-basics.R b/tests/testthat/test-forecasters-basics.R index 0c61a8d8..785b6ad2 100644 --- a/tests/testthat/test-forecasters-basics.R +++ b/tests/testthat/test-forecasters-basics.R @@ -5,6 +5,7 @@ forecasters <- list( c("flatline_fc", flatline_fc), c("smoothed_scaled", smoothed_scaled) ) +forecaster <- forecasters[[3]] for (forecaster in forecasters) { test_that(paste(forecaster[[1]], "gets the date and columns right"), { jhu <- epipredict::case_death_rate_subset %>% diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R index 3db526b4..96fc0df6 100644 --- a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -21,6 +21,7 @@ test_that("rolling_mean generates correct mean", { # same, but "ca" is reversed, noting mean(40:(40-7)) =36.5 linear_reverse_roll_mean <- c(seq(from = 40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1)) expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), linear_reverse_roll_mean) + expect_true("epi_df" %in% class(rolled)) }) test_that("rolling_sd generates correct standard deviation", { @@ -33,10 +34,26 @@ test_that("rolling_sd generates correct standard deviation", { expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_SD28"), linear_roll_sd) # even though ca is reversed, the changes are all the same, so the standard deviation is *exactly* the same values expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_SD28"), linear_roll_sd) + # doesn't break types + expect_true("epi_df" %in% class(rolled)) }) -testthat("get_trainable_names pulls out mean and sd columns", { + +test_that("get_trainable_names pulls out mean and sd columns", { rolled <- rolling_sd(epi_data, keep_mean = TRUE) expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_SD28", "b_m14", "b_SD28")) expect_equal(get_trainable_names(rolled, NULL), c("a", "b")) }) # TODO example with NA's, example with missing days, only one column, keep_mean + +test_that("update_predictors keeps unmodified predictors", { + epi_data["c"] = NaN + epi_data["d"] = NaN + epi_data["b_m14"] = NaN + epi_data["b_SD28"] = NaN + predictors <- c("a", "b", "c") # everything but d + modified <- c("b", "c") # we want to exclude b but not its modified versions + 
expected_predictors <- c("a", "b_m14", "b_SD28") + expect_equal(update_predictors(epi_data, modified, predictors), expected_predictors) + expected_if_all_modified <- c("b_m14", "b_SD28") + expect_equal(update_predictors(epi_data, NULL, predictors), expected_if_all_modified) +}) From e63be8924f437492458eb051c9ece4cfbc87807e Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 13 Dec 2023 15:35:43 -0800 Subject: [PATCH 04/20] smoothed_scaled data tests --- tests/testthat/test-forecasters-data.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-forecasters-data.R b/tests/testthat/test-forecasters-data.R index 365811dd..d12b4a40 100644 --- a/tests/testthat/test-forecasters-data.R +++ b/tests/testthat/test-forecasters-data.R @@ -6,8 +6,10 @@ forecasters <- tibble::tribble( ~forecaster, ~extra_params, ~extra_params_names, ~fc_name, scaled_pop, list(1, TRUE), list("ahead", "pop_scaling"), "scaled_pop", scaled_pop, list(1, FALSE), list("ahead", "pop_scaling"), "scaled_pop", - flatline_fc, list(1), list("ahead"), "flatline_fc" + flatline_fc, list(1), list("ahead"), "flatline_fc", + smoothed_scaled, list(1), list("ahead"), "smoothed_scaled" ) +expects_nonequal <- c("scaled_pop", "smoothed_scaled") synth_mean <- 25 synth_sd <- 2 tiny_sd <- 1.0e-5 @@ -49,7 +51,7 @@ different_constants <- epiprocess::as_epi_archive(rbind( )) for (ii in 1:nrow(forecasters)) { test_that(paste(forecasters$fc_name[[ii]], " predicts a constant median for constant data"), { - if (forecasters$fc_name[[ii]] == "scaled_pop") { + if (any(forecasters$fc_name[[ii]] %in% expects_nonequal)) { suppressWarnings(expect_warning(res <- get_pred(different_constants, ii), regexp = "prediction from rank-deficient fit")) } else { res <- get_pred(different_constants, ii) @@ -126,7 +128,7 @@ missing_state <- epiprocess::as_epi_archive(rbind( )) for (ii in seq_len(nrow(forecasters))) { test_that(paste(forecasters$fc_name[[ii]], "predicts well in the presence of only one state with variably delayed data"), { - if (forecasters$fc_name[[ii]] == "scaled_pop") { + if (any(forecasters$fc_name[[ii]] %in% expects_nonequal)) { suppressWarnings(expect_warning(res <- get_pred(missing_state, ii), regexp = "prediction from rank-deficient fit")) } else { res <- get_pred(missing_state, ii) @@ -171,7 +173,7 @@ for (ii in seq_len(nrow(forecasters))) { test_that(paste(forecasters$fc_name[[ii]], "predicts a linear increasing slope correctly"), { # flatline will definitely fail this, so it's exempt if (!identical(forecasters$forecaster[[ii]], flatline_fc)) { - if (forecasters$fc_name[[ii]] == "scaled_pop") { + if (any(forecasters$fc_name[[ii]] %in% expects_nonequal)) { suppressWarnings(expect_warning(res <- get_pred(linear, ii), regexp = "prediction from rank-deficient fit")) } else { res <- get_pred(linear, ii) From ea335c9f418d19d81ccbdfc734d2efa2a4eb6838 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 14 Dec 2023 10:42:48 -0800 Subject: [PATCH 05/20] docfix, points no longer oversized --- R/epipredict_utilities.R | 2 +- app.R | 2 +- man/arx_preprocess.Rd | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/epipredict_utilities.R b/R/epipredict_utilities.R index 9e826273..083ce35f 100644 --- a/R/epipredict_utilities.R +++ b/R/epipredict_utilities.R @@ -2,7 +2,7 @@ #' add the default steps for arx_forecaster #' @description #' add the default steps for arx_forecaster -#' @param rec an [`epipredict::epi_recipe`] +#' @param preproc an [`epipredict::epi_recipe`] #' @param outcome a character of the 
column to be predicted #' @param predictors a character vector of the columns used as predictors #' @param args_list an [`epipredict::arx_args_list`] diff --git a/app.R b/app.R index c5ecf1a2..ef60b3c3 100644 --- a/app.R +++ b/app.R @@ -261,7 +261,7 @@ shinyApp( # Use scatterplot or lines depending on the x var. { if (input$x_var %in% c(input$facet_vars, "geo_value", "forecaster", "ahead")) { - . + geom_point(aes(size = n)) + expand_limits(size = 0) + . + geom_point(aes(size = n/100)) + expand_limits(size = 0) } else { . + geom_line() } diff --git a/man/arx_preprocess.Rd b/man/arx_preprocess.Rd index 1b24a41c..44ea895a 100644 --- a/man/arx_preprocess.Rd +++ b/man/arx_preprocess.Rd @@ -7,13 +7,13 @@ arx_preprocess(preproc, outcome, predictors, args_list) } \arguments{ +\item{preproc}{an \code{\link[epipredict:epi_recipe]{epipredict::epi_recipe}}} + \item{outcome}{a character of the column to be predicted} \item{predictors}{a character vector of the columns used as predictors} \item{args_list}{an \code{\link[epipredict:arx_args_list]{epipredict::arx_args_list}}} - -\item{rec}{an \code{\link[epipredict:epi_recipe]{epipredict::epi_recipe}}} } \description{ add the default steps for arx_forecaster From 8df5f05ce513453d55dc8c26e8f49e4a08036a9a Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 14 Dec 2023 11:00:28 -0800 Subject: [PATCH 06/20] docs only tell one thing at a time --- R/data_transforms.R | 6 +++++- app.R | 4 ++-- man/cache_metadata.Rd | 3 +++ man/update_predictors.Rd | 4 +++- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/R/data_transforms.R b/R/data_transforms.R index 04a71494..8f429f90 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -27,7 +27,8 @@ get_nonkey_names <- function(epi_data) { #' @description #' should only be applied after both rolling_mean and rolling_sd #' @param epi_data the epi_df -#' @param cols the list of columns +#' @param cols_modified the list of columns +#' @param predictors the initial set of predictors; any unmodified are kept, any modified are replaced #' @importFrom purrr map map_chr reduce #' @export update_predictors <- function(epi_data, cols_modified, predictors) { @@ -71,6 +72,9 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) { } #' store the metadata in a easy to reapply way +#' @description +#' store the metadata in a easy to reapply way +#' @param epi_data the epi_df #' @importFrom purrr list_modify cache_metadata <- function(epi_data) { features <- list() diff --git a/app.R b/app.R index ef60b3c3..064c4ffe 100644 --- a/app.R +++ b/app.R @@ -170,7 +170,7 @@ shinyApp( verticalLayout( plotlyOutput("main_plot", height = "90em"), h2("forecaster name -> parameters"), - #textOutput("forecaster_param_title"), + # textOutput("forecaster_param_title"), dataTableOutput("forecaster_table"), h2("ensemble name -> parameters"), dataTableOutput("ensemble_table") @@ -261,7 +261,7 @@ shinyApp( # Use scatterplot or lines depending on the x var. { if (input$x_var %in% c(input$facet_vars, "geo_value", "forecaster", "ahead")) { - . + geom_point(aes(size = n/100)) + expand_limits(size = 0) + . + geom_point(aes(size = n / 100)) + expand_limits(size = 0) } else { . 
+ geom_line() } diff --git a/man/cache_metadata.Rd b/man/cache_metadata.Rd index 14a6d64b..1dde416f 100644 --- a/man/cache_metadata.Rd +++ b/man/cache_metadata.Rd @@ -6,6 +6,9 @@ \usage{ cache_metadata(epi_data) } +\arguments{ +\item{epi_data}{the epi_df} +} \description{ store the metadata in a easy to reapply way } diff --git a/man/update_predictors.Rd b/man/update_predictors.Rd index e7d55b21..9b9e603c 100644 --- a/man/update_predictors.Rd +++ b/man/update_predictors.Rd @@ -9,7 +9,9 @@ update_predictors(epi_data, cols_modified, predictors) \arguments{ \item{epi_data}{the epi_df} -\item{cols}{the list of columns} +\item{cols_modified}{the list of columns} + +\item{predictors}{the initial set of predictors; any unmodified are kept, any modified are replaced} } \description{ should only be applied after both rolling_mean and rolling_sd From 36608f6a917dafb105c9a03ce841be13a039d5a1 Mon Sep 17 00:00:00 2001 From: David Weber Date: Tue, 19 Dec 2023 14:26:29 -0800 Subject: [PATCH 07/20] Update R/data_transforms.R Co-authored-by: brookslogan --- R/data_transforms.R | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/R/data_transforms.R b/R/data_transforms.R index 8f429f90..1434d3d0 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -113,19 +113,18 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = mean_width <- as.integer(ceiling(sd_width / 2)) } cols_to_sd <- get_trainable_names(epi_data, cols_to_sd) - metadata <- cache_metadata(epi_data) - epi_data %<>% group_by(geo_value) + result <- epi_data + result %<>% group_by(geo_value) for (col in cols_to_sd) { mean_name <- paste0(col, "_m", mean_width) sd_name <- paste0(col, "_SD", sd_width) - epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width)) - epi_data %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width)) + result %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width)) + result %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width)) if (!keep_mean) { # TODO make sure the extra info sticks around - epi_data %<>% select(-{{ mean_name }}) + result %<>% select(-{{ mean_name }}) } - epi_data %<>% as_epi_df(metadata$geo_type, metadata$time_type, metadata$as_of, metadata$all_others) } - epi_data %<>% ungroup() - return(epi_data) + result %<>% ungroup() + result %<>% dplyr_reconstruct(epi_data) } From 0eb61e3adf0cd24ccacf3cf8511acdee981510ff Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 19 Dec 2023 14:37:39 -0800 Subject: [PATCH 08/20] fix: warnings are one at a time, apparently --- R/forecaster_smoothed_scaled.R | 1 + man/smoothed_scaled.Rd | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R index 12e93315..becfb3a3 100644 --- a/R/forecaster_smoothed_scaled.R +++ b/R/forecaster_smoothed_scaled.R @@ -27,6 +27,7 @@ #' @param sd_cols the names of the columns to smooth. If `NULL` its includes #' the sd of everything #' @param quantile_levels The quantile levels to predict. Defaults to those +#' @param ... any additional arguments as used by [arx_args_list] #' required by covidhub. 
#' @seealso some utilities for making forecasters: [format_storage], #' [perform_sanity_checks] diff --git a/man/smoothed_scaled.Rd b/man/smoothed_scaled.Rd index 4a079fa4..683c56d7 100644 --- a/man/smoothed_scaled.Rd +++ b/man/smoothed_scaled.Rd @@ -38,8 +38,7 @@ existing examples)} \item{trainer}{an example extra parameter that is fairly common} -\item{quantile_levels}{The quantile levels to predict. Defaults to those -required by covidhub.} +\item{quantile_levels}{The quantile levels to predict. Defaults to those} \item{smooth_width}{the number of days over which to do smoothing. If \code{NULL}, then no smoothing is applied.} @@ -55,6 +54,9 @@ used.} \item{sd_cols}{the names of the columns to smooth. If \code{NULL} its includes the sd of everything} + +\item{...}{any additional arguments as used by \link{arx_args_list} +required by covidhub.} } \description{ This is a variant of \code{scaled_pop}, which predicts on a smoothed version of From c50249b42560be4bde440f6ed838f2a7ecf78173 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 20 Dec 2023 11:05:04 -0800 Subject: [PATCH 09/20] switch to epi_slide, add logan's suggestions, NA tests --- NAMESPACE | 3 +- R/data_transforms.R | 37 ++++-------------- man/cache_metadata.Rd | 14 ------- tests/testthat/test-transforms.R | 67 ++++++++++++++++++++++++-------- 4 files changed, 59 insertions(+), 62 deletions(-) delete mode 100644 man/cache_metadata.Rd diff --git a/NAMESPACE b/NAMESPACE index 8b1721f6..c4dd11ea 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -85,11 +85,11 @@ importFrom(epipredict,step_epi_naomit) importFrom(epipredict,step_population_scaling) importFrom(epipredict,step_training_window) importFrom(epiprocess,as_epi_df) +importFrom(epiprocess,epi_slide) importFrom(epiprocess,epix_slide) importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(purrr,imap) -importFrom(purrr,list_modify) importFrom(purrr,map) importFrom(purrr,map2_vec) importFrom(purrr,map_chr) @@ -103,7 +103,6 @@ importFrom(rlang,.data) importFrom(rlang,quo) importFrom(rlang,sym) importFrom(rlang,syms) -importFrom(slider,slide2_dbl) importFrom(slider,slide_dbl) importFrom(targets,tar_config_get) importFrom(targets,tar_group) diff --git a/R/data_transforms.R b/R/data_transforms.R index 1434d3d0..b06f8779 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -59,40 +59,19 @@ update_predictors <- function(epi_data, cols_modified, predictors) { #' @param width the number of days (or examples, the sliding isn't time-aware) to use #' @param cols_to_mean the non-key columns to take the mean over. 
`NULL` means all
 #' @importFrom slider slide_dbl
+#' @importFrom epiprocess epi_slide
 #' @export
 rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
   cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
   epi_data %<>% group_by(geo_value)
   for (col in cols_to_mean) {
     mean_name <- paste0(col, "_m", width)
-    epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = width))
+    epi_data %<>% epi_slide(~ mean(.x[[col]]), before = width, new_col_name = mean_name)
   }
   epi_data %<>% ungroup()
   return(epi_data)
 }
 
-#' store the metadata in a easy to reapply way
-#' @description
-#' store the metadata in a easy to reapply way
-#' @param epi_data the epi_df
-#' @importFrom purrr list_modify
-cache_metadata <- function(epi_data) {
-  features <- list()
-  all_others <- attributes(epi_data)$metadata
-  all_others["geo_type"] <- NULL
-  all_others["time_type"] <- NULL
-  all_others["as_of"] <- NULL
-  if (length(all_others) == 0) {
-    all_others <- list()
-  }
-  features <- list(
-    as_of = attributes(epi_data)$metadata$as_of,
-    geo_type = attributes(epi_data)$metadata$geo_type,
-    time_type = attributes(epi_data)$metadata$time_type, all_others = all_others
-  )
-  return(features)
-}
-
 #' get a rolling standard deviation for the named columns
 #' @description
 #' A rolling standard deviation, based off of a rolling mean. First it
@@ -106,25 +85,25 @@ cache_metadata <- function(epi_data) {
 #' (so 14 in the complete default case)
 #' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
 #' @param keep_mean bool, if `TRUE`, it retains the mean column
-#' @importFrom slider slide_dbl slide2_dbl
+#' @importFrom epiprocess epi_slide
 #' @export
 rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
   if (is.null(mean_width)) {
     mean_width <- as.integer(ceiling(sd_width / 2))
   }
   cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
   result <- epi_data
-  result %<>% group_by(geo_value)
   for (col in cols_to_sd) {
+    result %<>% group_by(geo_value)
     mean_name <- paste0(col, "_m", mean_width)
-    sd_name <- paste0(col, "_SD", sd_width)
-    result %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width))
-    result %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width))
+    sd_name <- paste0(col, "_sd", sd_width)
+    result %<>% epi_slide(~ mean(.x[[col]]), before = mean_width, new_col_name = mean_name)
+    result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2)), before = sd_width, new_col_name = sd_name)
     if (!keep_mean) {
       # TODO make sure the extra info sticks around
       result %<>% select(-{{ mean_name }})
     }
+    result %<>% dplyr_reconstruct(epi_data)
   }
   result %<>% ungroup()
-  result %<>% dplyr_reconstruct(epi_data)
 }
diff --git a/man/cache_metadata.Rd b/man/cache_metadata.Rd
deleted file mode 100644
index 1dde416f..00000000
--- a/man/cache_metadata.Rd
+++ /dev/null
@@ -1,14 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data_transforms.R
-\name{cache_metadata}
-\alias{cache_metadata}
-\title{store the metadata in a easy to reapply way}
-\usage{
-cache_metadata(epi_data)
-}
-\arguments{
-\item{epi_data}{the epi_df}
-}
-\description{
-store the metadata in a easy to reapply way
-}
diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R
index 3db526b4..73b522ff 100644
--- 
a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -1,46 +1,79 @@ +library(dplyr) n_days <- 40 +removed_date <- 10 simple_dates <- seq(as.Date("2012-01-01"), by = "day", length.out = n_days) -rand_vals <- rnorm(n_days) +simple_dates <- simple_dates[-removed_date] +rand_vals <- rnorm(n_days-1) + +# Two states, with 2 variables. a is linear, going up in one state and down in the other +# b is just random +# note that day 10 is missing epi_data <- epiprocess::as_epi_df(rbind(tibble( geo_value = "al", time_value = simple_dates, - a = 1:n_days, + a = 1:(n_days-1), b = rand_vals ), tibble( geo_value = "ca", time_value = simple_dates, - a = n_days:1, + a = (n_days-1):1, b = rand_vals + 10 ))) test_that("rolling_mean generates correct mean", { rolled <- rolling_mean(epi_data) + rolled expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m7", "b_m7")) # hand specified rolling mean with a rear window of 7, noting that mean(1:7) = 4 - linear_roll_mean <- c(seq(from = 1, to = 4, by = .5), seq(from = 4.5, to = 36.5, by = 1)) - expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m7"), linear_roll_mean) + linear_roll_mean <- c(seq(from = 1, to = 4, by = .5), seq(from = 4.5, to = 35.5, by = 1)) + # day 10 is missing, so days 11-18 are thrown off + lag_st <- 10 + unusual_days <- c(mean(c((lag_st):(lag_st-6))), mean(c((lag_st+1):(lag_st+1-6))), mean(c((lag_st+2):(lag_st+2-6))), mean(c((lag_st+3):(lag_st+3-6))), mean(c((lag_st+4):(lag_st+4-6))), mean(c((lag_st+5):(lag_st+5-6))), mean(c((lag_st+6):(lag_st+6-6)))) + # stitching the lag induced hiccup into the "normal" mean values + expected_mean <- c(linear_roll_mean[1:9], unusual_days, linear_roll_mean[17:(n_days-1)]) + + expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m7"), expected_mean) # same, but "ca" is reversed, noting mean(40:(40-7)) =36.5 - linear_reverse_roll_mean <- c(seq(from = 40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1)) - expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), linear_reverse_roll_mean) + linear_reverse_roll_mean <- c(seq(from = 39, to = 35.5, by = -0.5), seq(from = 34.5, to = 4.5, by = -1)) + lag_st <- 36 + # day 10 is missing, so days 11-18 are thrown off + unusual_days <- c(mean(c((lag_st):(lag_st-6))), mean(c((lag_st-1):(lag_st-1-6))), mean(c((lag_st-2):(lag_st-2-6))), mean(c((lag_st-3):(lag_st-3-6))), mean(c((lag_st-4):(lag_st-4-6))), mean(c((lag_st-5):(lag_st-5-6))), mean(c((lag_st-6):(lag_st-6-6)))) + # stitching the lag induced hiccup into the "normal" mean values + expected_mean <- c(linear_reverse_roll_mean[1:9], unusual_days, linear_reverse_roll_mean[17:(n_days-1)]) + # actually testing + expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), expected_mean) + + # making sure the type is maintained expect_true("epi_df" %in% class(rolled)) }) test_that("rolling_sd generates correct standard deviation", { - rolled <- rolling_sd(epi_data) - expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_SD28", "b_SD28")) + rolled <- rolling_sd(epi_data,keep_mean = TRUE) + rolled %>% filter(geo_value == "al") %>% pull("a_m14") + rolled %>% filter(geo_value == "al") + rolled %>% filter(geo_value == "al") %>% pull("a") + rolled %>% filter(geo_value == "al") %>% pull("a_sd28") + expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_sd28", "b_sd28")) # hand specified rolling mean with a rear window of 7, noting that mean(1:14) = 7.5 - linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), 
seq(from = 8, to = 33, by = 1)) + linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), seq(from = 8.5, to = 16.5, by = 1), seq(from = 17, to = 32, by = 1)) + linear_roll_mean + expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m14"), linear_roll_mean) # and the standard deviation is - linear_roll_sd <- sqrt(slider::slide_dbl((1:40 - linear_roll_mean)^2, mean, .before = 28)) - expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_SD28"), linear_roll_sd) + linear_roll_mean <- append(linear_roll_mean, NA, after = removed_date-1) + linear_values <- 1:39 + linear_values <- append(linear_values, NA, after = removed_date-1) + linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28)) + # drop the extra date caused by the inclusion of the NAs + linear_roll_sd <- linear_roll_sd[-(removed_date)] + expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_sd28"), linear_roll_sd) # even though ca is reversed, the changes are all the same, so the standard deviation is *exactly* the same values - expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_SD28"), linear_roll_sd) + expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_sd28"), linear_roll_sd) # doesn't break types expect_true("epi_df" %in% class(rolled)) }) test_that("get_trainable_names pulls out mean and sd columns", { rolled <- rolling_sd(epi_data, keep_mean = TRUE) - expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_SD28", "b_m14", "b_SD28")) + expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_sd28", "b_m14", "b_sd28")) expect_equal(get_trainable_names(rolled, NULL), c("a", "b")) }) # TODO example with NA's, example with missing days, only one column, keep_mean @@ -49,11 +82,11 @@ test_that("update_predictors keeps unmodified predictors", { epi_data["c"] = NaN epi_data["d"] = NaN epi_data["b_m14"] = NaN - epi_data["b_SD28"] = NaN + epi_data["b_sd28"] = NaN predictors <- c("a", "b", "c") # everything but d modified <- c("b", "c") # we want to exclude b but not its modified versions - expected_predictors <- c("a", "b_m14", "b_SD28") + expected_predictors <- c("a", "b_m14", "b_sd28") expect_equal(update_predictors(epi_data, modified, predictors), expected_predictors) - expected_if_all_modified <- c("b_m14", "b_SD28") + expected_if_all_modified <- c("b_m14", "b_sd28") expect_equal(update_predictors(epi_data, NULL, predictors), expected_if_all_modified) }) From 0b16c818336f546e5c8e6638851194f46b67ee83 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 20 Dec 2023 11:43:15 -0800 Subject: [PATCH 10/20] test: fix updated assumption --- tests/testthat/test-transforms.R | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R index 73b522ff..7976984e 100644 --- a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -48,11 +48,7 @@ test_that("rolling_mean generates correct mean", { test_that("rolling_sd generates correct standard deviation", { rolled <- rolling_sd(epi_data,keep_mean = TRUE) - rolled %>% filter(geo_value == "al") %>% pull("a_m14") - rolled %>% filter(geo_value == "al") - rolled %>% filter(geo_value == "al") %>% pull("a") - rolled %>% filter(geo_value == "al") %>% pull("a_sd28") - expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_sd28", "b_sd28")) + expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_sd28", "b_m14", "b_sd28")) 
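+  # (keep_mean = TRUE above also retains the intermediate a_m14/b_m14 rolling-mean columns,
+  # which is why they appear here alongside the a_sd28/b_sd28 columns)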
# hand specified rolling mean with a rear window of 7, noting that mean(1:14) = 7.5 linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), seq(from = 8.5, to = 16.5, by = 1), seq(from = 17, to = 32, by = 1)) linear_roll_mean From 514219e79aae725b1f19406049c1fb570898e890 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 20 Dec 2023 11:55:34 -0800 Subject: [PATCH 11/20] test: make sure keep is off by default --- tests/testthat/test-transforms.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R index 7976984e..4020d437 100644 --- a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -86,3 +86,8 @@ test_that("update_predictors keeps unmodified predictors", { expected_if_all_modified <- c("b_m14", "b_sd28") expect_equal(update_predictors(epi_data, NULL, predictors), expected_if_all_modified) }) + +test_that("rolling_sd doesn't keep the mean columns by default", { + rolled <- rolling_sd(epi_data) + expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_sd28", "b_sd28")) +}) From b976103d758d45375b4c4a814ec1a7d667405425 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 20 Dec 2023 13:36:03 -0800 Subject: [PATCH 12/20] docs: slightly better --- R/data_transforms.R | 11 +++++++---- R/targets_utils.R | 9 ++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/R/data_transforms.R b/R/data_transforms.R index b06f8779..ecf6fcf7 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -2,7 +2,7 @@ #' extract the non-key, non-smoothed columns from epi_data #' @keywords internal -#' @param epi_data the epi_data tibble +#' @param epi_data the `epi_df` #' @param cols vector of column names to use. If `NULL`, fill with all non-key columns get_trainable_names <- function(epi_data, cols) { if (is.null(cols)) { @@ -20,14 +20,17 @@ get_nonkey_names <- function(epi_data) { cols <- names(epi_data) cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))] + return(cols) } #' update the predictors to only contain the smoothed/sd versions of cols #' @description -#' should only be applied after both rolling_mean and rolling_sd +#' modifies the list of predictors so that any which have been modified have the +#' modified versions included, and not the original. Should only be applied +#' after both rolling_mean and rolling_sd. #' @param epi_data the epi_df -#' @param cols_modified the list of columns +#' @param cols_modified the list of columns to modify. If this is `NULL`, that means we were modifying every column.
#' @param predictors the initial set of predictors; any unmodified are kept, any modified are replaced #' @importFrom purrr map map_chr reduce #' @export @@ -37,7 +40,7 @@ update_predictors <- function(epi_data, cols_modified, predictors) { other_predictors <- map(cols_modified, ~ !grepl(.x, predictors)) %>% reduce(`&`) other_predictors <- predictors[other_predictors] } else { - other_predictors <- c() + other_predictors <- character(0L) } # all the non-key names col_names <- get_nonkey_names(epi_data) diff --git a/R/targets_utils.R b/R/targets_utils.R index feac27c2..1743e5da 100644 --- a/R/targets_utils.R +++ b/R/targets_utils.R @@ -138,7 +138,14 @@ make_shared_grids <- function() { ), tidyr::expand_grid( forecaster = "flatline_fc", - ahead = 1:7 + ahead = c(1:7, 14, 21, 28) + ), + tidyr::expand_grid( + forecaster = "smoothed_scaled", + trainer = c("quantreg"), + ahead = c(1:7, 14, 21, 28), + lags = list(list(c(0, 3, 5, 7, 14), c(0),), c(0, 7, 14)), + pop_scaling = c(FALSE) ) ) } From c4c7430aea12249948c29dcf4e29c5da99b07780 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 21 Dec 2023 15:29:47 -0800 Subject: [PATCH 13/20] continuing to clarify update_predictors --- R/data_transforms.R | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/R/data_transforms.R b/R/data_transforms.R index ecf6fcf7..1ccb3ec9 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -29,27 +29,29 @@ get_nonkey_names <- function(epi_data) { #' modifies the list of preditors so that any which have been modified have the #' modified versions included, and not the original. Should only be applied #' after both rolling_mean and rolling_sd. -#' @param epi_data the epi_df -#' @param cols_modified the list of columns to modify. If this is `NULL`, that means we were modifying every column. -#' @param predictors the initial set of predictors; any unmodified are kept, any modified are replaced +#' @param epi_data the epi_df, only included to get the non-key column names +#' @param cols_modified the list of columns which have been modified. If this is `NULL`, that means we were modifying every column. +#' @param predictors the initial set of predictors; any unmodified are kept, any modified are replaced with the modified versions (e.g. "a" becoming "a_m17"). #' @importFrom purrr map map_chr reduce +#' @return returns an updated list of predictors, with modified columns replaced and non-modified columns left intact. 
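+#' A sketch of the replacement semantics, mirroring test-transforms.R (here "b" has already
+#' gained the derived columns "b_m14" and "b_sd28" while "a" is unmodified):
+#'   update_predictors(epi_data, cols_modified = c("b"), predictors = c("a", "b"))
+#'   # returns c("a", "b_m14", "b_sd28"): "b" is swapped for its derived columns, "a" is kept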
#' @export update_predictors <- function(epi_data, cols_modified, predictors) { if (!is.null(cols_modified)) { # if cols_modified isn't null, make sure we include predictors that weren't modified - other_predictors <- map(cols_modified, ~ !grepl(.x, predictors)) %>% reduce(`&`) - other_predictors <- predictors[other_predictors] + unchanged_predictors <- map(cols_modified, ~ !grepl(.x, predictors, fixed = TRUE)) %>% reduce(`&`) + unchanged_predictors <- predictors[unchanged_predictors] } else { - other_predictors <- character(0L) + # if it's null, we've modified every predictor + unchanged_predictors <- character(0L) } # all the non-key names col_names <- get_nonkey_names(epi_data) - is_present <- function(x) { - grepl(x, col_names) & !(col_names %in% predictors) + is_present <- function(original_predictor) { + grepl(original_predictor, col_names) & !(col_names %in% predictors) } is_modified <- map(predictors, is_present) %>% reduce(`|`) new_predictors <- col_names[is_modified] - return(c(other_predictors, new_predictors)) + return(c(unchanged_predictors, new_predictors)) } #' get a rolling average for the named columns From d6d4cddda61508ec630fe4b89ade3d054604d041 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 10:46:28 -0800 Subject: [PATCH 14/20] fix before behavior: mean tests simplified --- R/data_transforms.R | 6 ++--- tests/testthat/test-transforms.R | 42 +++++++++++++++++--------------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/R/data_transforms.R b/R/data_transforms.R index 1ccb3ec9..58eb306e 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -71,7 +71,7 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) { epi_data %<>% group_by(geo_value) for (col in cols_to_mean) { mean_name <- paste0(col, "_m", width) - epi_data %<>% epi_slide(~ mean(.x[[col]]), before = width, new_col_name = mean_name) + epi_data %<>% epi_slide(~ mean(.x[[col]]), before = width-1L, new_col_name = mean_name) } epi_data %<>% ungroup() return(epi_data) @@ -102,8 +102,8 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = result %<>% group_by(geo_value) mean_name <- paste0(col, "_m", mean_width) sd_name <- paste0(col, "_sd", sd_width) - result %<>% epi_slide(~ mean(.x[[col]]), before = mean_width, new_col_name = mean_name) - result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2)), before = sd_width, new_col_name = sd_name) + result %<>% epi_slide(~ mean(.x[[col]]), before = mean_width-1L, new_col_name = mean_name) + result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2)), before = sd_width-1, new_col_name = sd_name) if (!keep_mean) { # TODO make sure the extra info sticks around result %<>% select(-{{ mean_name }}) diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R index 4020d437..e6b79ed8 100644 --- a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -3,7 +3,7 @@ n_days <- 40 removed_date <- 10 simple_dates <- seq(as.Date("2012-01-01"), by = "day", length.out = n_days) simple_dates <- simple_dates[-removed_date] -rand_vals <- rnorm(n_days-1) +rand_vals <- rnorm(n_days - 1) # Two states, with 2 variables. 
a is linear, going up in one state and down in the other # b is just random @@ -11,12 +11,12 @@ rand_vals <- rnorm(n_days-1) epi_data <- epiprocess::as_epi_df(rbind(tibble( geo_value = "al", time_value = simple_dates, - a = 1:(n_days-1), + a = 1:(n_days - 1), b = rand_vals ), tibble( geo_value = "ca", time_value = simple_dates, - a = (n_days-1):1, + a = (n_days - 1):1, b = rand_vals + 10 ))) test_that("rolling_mean generates correct mean", { @@ -24,21 +24,23 @@ test_that("rolling_mean generates correct mean", { rolled expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m7", "b_m7")) # hand specified rolling mean with a rear window of 7, noting that mean(1:7) = 4 - linear_roll_mean <- c(seq(from = 1, to = 4, by = .5), seq(from = 4.5, to = 35.5, by = 1)) - # day 10 is missing, so days 11-18 are thrown off - lag_st <- 10 - unusual_days <- c(mean(c((lag_st):(lag_st-6))), mean(c((lag_st+1):(lag_st+1-6))), mean(c((lag_st+2):(lag_st+2-6))), mean(c((lag_st+3):(lag_st+3-6))), mean(c((lag_st+4):(lag_st+4-6))), mean(c((lag_st+5):(lag_st+5-6))), mean(c((lag_st+6):(lag_st+6-6)))) + linear_roll_mean <- c(seq(from = 1, by = .5, length.out = 7), seq(from = 5, to = 36, by = 1)) + # day 10 is missing, so the average days 11-16 are thrown off, only using 6 values instead of 7 + gap_starts <- epi_data %>% filter(geo_value == "al" & time_value == as.Date("2012-01-11")) %>% pull(a) + unusual_days <- map_vec(seq(from = 0, to = 5), \(d) mean(((gap_starts + d) - 0):((gap_starts + d) - 5))) # stitching the lag induced hiccup into the "normal" mean values - expected_mean <- c(linear_roll_mean[1:9], unusual_days, linear_roll_mean[17:(n_days-1)]) + expected_mean <- c(linear_roll_mean[1:9], unusual_days, linear_roll_mean[16:(n_days - 1)]) + expected_mean expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m7"), expected_mean) + # Doing the same for California # same, but "ca" is reversed, noting mean(40:(40-7)) =36.5 - linear_reverse_roll_mean <- c(seq(from = 39, to = 35.5, by = -0.5), seq(from = 34.5, to = 4.5, by = -1)) - lag_st <- 36 - # day 10 is missing, so days 11-18 are thrown off - unusual_days <- c(mean(c((lag_st):(lag_st-6))), mean(c((lag_st-1):(lag_st-1-6))), mean(c((lag_st-2):(lag_st-2-6))), mean(c((lag_st-3):(lag_st-3-6))), mean(c((lag_st-4):(lag_st-4-6))), mean(c((lag_st-5):(lag_st-5-6))), mean(c((lag_st-6):(lag_st-6-6)))) + linear_reverse_roll_mean <- c(seq(from = 39, by = -0.5, length.out = 7), seq(from = 35, to = 4, by = -1)) + # day 10 is missing, so days 11-16 are thrown off + gap_starts <- epi_data %>% filter(geo_value == "ca" & time_value == as.Date("2012-01-11")) %>% pull(a) + unusual_days <- map_vec(seq(from = 0, to = 5), \(d) mean(((gap_starts - d) + 0):((gap_starts - d) + 5))) # stitching the lag induced hiccup into the "normal" mean values - expected_mean <- c(linear_reverse_roll_mean[1:9], unusual_days, linear_reverse_roll_mean[17:(n_days-1)]) + expected_mean <- c(linear_reverse_roll_mean[1:9], unusual_days, linear_reverse_roll_mean[16:(n_days - 1)]) # actually testing expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), expected_mean) @@ -47,16 +49,16 @@ test_that("rolling_mean generates correct mean", { }) test_that("rolling_sd generates correct standard deviation", { - rolled <- rolling_sd(epi_data,keep_mean = TRUE) + rolled <- rolling_sd(epi_data, keep_mean = TRUE) expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_sd28", "b_m14", "b_sd28")) # hand specified rolling mean with a rear window of 7, noting that mean(1:14) 
= 7.5 linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), seq(from = 8.5, to = 16.5, by = 1), seq(from = 17, to = 32, by = 1)) linear_roll_mean expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m14"), linear_roll_mean) # and the standard deviation is linear_roll_mean <- append(linear_roll_mean, NA, after = removed_date - 1) linear_values <- 1:39 linear_values <- append(linear_values, NA, after = removed_date - 1) - linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28)) + linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28 - 1)) # drop the extra date caused by the inclusion of the NAs linear_roll_sd <- linear_roll_sd[-(removed_date)] expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_sd28"), linear_roll_sd) @@ -75,10 +77,10 @@ test_that("get_trainable_names pulls out mean and sd columns", { # TODO example with NA's, example with missing days, only one column, keep_mean test_that("update_predictors keeps unmodified predictors", { - epi_data["c"] = NaN - epi_data["d"] = NaN - epi_data["b_m14"] = NaN - epi_data["b_sd28"] = NaN + epi_data["c"] <- NaN + epi_data["d"] <- NaN + epi_data["b_m14"] <- NaN + epi_data["b_sd28"] <- NaN predictors <- c("a", "b", "c") # everything but d modified <- c("b", "c") # we want to exclude b but not its modified versions expected_predictors <- c("a", "b_m14", "b_sd28") From d542be27e21418b201e802d6ec6be027db9657bc Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 12:08:35 -0800 Subject: [PATCH 15/20] various suggestions from logan, before=n_points-1 --- NAMESPACE | 1 + R/data_transforms.R | 6 +- R/forecaster_scaled_pop.R | 2 +- R/forecaster_smoothed_scaled.R | 72 +++++++++++++----------- R/targets_utils.R | 4 +- man/get_trainable_names.Rd | 2 +- man/smoothed_scaled.Rd | 13 ++++- man/update_predictors.Rd | 13 +++-- tests/testthat/test-forecasters-basics.R | 1 - tests/testthat/test-transforms.R | 11 +++- 10 files changed, 75 insertions(+), 50 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c4dd11ea..e6f711c3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -113,3 +113,4 @@ importFrom(tidyr,drop_na) importFrom(tidyr,expand_grid) importFrom(tidyr,pivot_wider) importFrom(tidyr,unnest) +importFrom(zeallot,"%<-%") diff --git a/R/data_transforms.R b/R/data_transforms.R index 58eb306e..7329823d 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -71,7 +71,7 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) { epi_data %<>% group_by(geo_value) for (col in cols_to_mean) { mean_name <- paste0(col, "_m", width) - epi_data %<>% epi_slide(~ mean(.x[[col]]), before = width-1L, new_col_name = mean_name) + epi_data %<>% epi_slide(~ mean(.x[[col]], na.rm = TRUE), before = width-1L, new_col_name = mean_name) } epi_data %<>% ungroup() return(epi_data) @@ -102,8 +102,8 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = result %<>% group_by(geo_value) mean_name <- paste0(col, "_m", mean_width) sd_name <- paste0(col, "_sd", sd_width) - result %<>% epi_slide(~ mean(.x[[col]]), before = mean_width-1L, new_col_name = mean_name) - result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2)), before = sd_width-1, new_col_name = sd_name) + result %<>% epi_slide(~ mean(.x[[col]], na.rm = TRUE), before = mean_width-1L, new_col_name = mean_name) + result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2, na.rm = TRUE)), before = sd_width-1, new_col_name = sd_name) if (!keep_mean) { # TODO make sure the extra info
sticks around result %<>% select(-{{ mean_name }}) diff --git a/R/forecaster_scaled_pop.R b/R/forecaster_scaled_pop.R index 6db0b8b0..2b5d0020 100644 --- a/R/forecaster_scaled_pop.R +++ b/R/forecaster_scaled_pop.R @@ -73,7 +73,7 @@ scaled_pop <- function(epi_data, args_input[["ahead"]] <- effective_ahead args_input[["quantile_levels"]] <- quantile_levels args_list <- do.call(arx_args_list, args_input) - # if you want to ignore extra_sources, setting predictors is the way to do it + # if you want to hardcode particular predictors in a particular forecaster predictors <- c(outcome, extra_sources) # TODO: Partial match quantile_level coming from here (on Dmitry's machine) argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R index becfb3a3..b1c02275 100644 --- a/R/forecaster_smoothed_scaled.R +++ b/R/forecaster_smoothed_scaled.R @@ -14,8 +14,17 @@ #' to the max time value of the `epi_df`. how to handle this is a modelling #' question left up to each forecaster; see latency_adjusting.R for the #' existing examples) -#' @param pop_scaling an example extra parameter unique to this forecaster -#' @param trainer an example extra parameter that is fairly common +#' @param pop_scaling bool; if `TRUE`, assume all numeric columns are on the +#' count scale and translate them to a rate scale for model fitting. +#' Predictions will be translated back to count scale. Any +#' `layer_residual_quantiles` (for non-`"quantile_reg"` `trainer`s) will be +#' done on the rate scale. When specifying predictor lags, note that rate +#' variables will use the same names as and overwrite the count variables. +#' Rates here will be counts per 100k population, based on +#' `epipredict::state_census`. +#' @param trainer optional; parsnip model specification to use for the core +#' fitting & prediction (the `spec` of the internal +#' [`epipredict::epi_workflow`]). Default is `parsnip::linear_reg()`. #' @param smooth_width the number of days over which to do smoothing. If `NULL`, #' then no smoothing is applied. #' @param smooth_cols the names of the columns to smooth. If `NULL` it smooths @@ -34,57 +43,52 @@ #' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling #' @importFrom tibble tibble #' @importFrom recipes all_numeric +#' @importFrom zeallot %<-% #' @export smoothed_scaled <- function(epi_data, - outcome, - extra_sources = "", - ahead = 1, - pop_scaling = TRUE, - trainer = parsnip::linear_reg(), - quantile_levels = covidhub_probs(), - smooth_width = 7, - smooth_cols = NULL, - sd_width = 28, - sd_mean_width = 14, - sd_cols = NULL, - ...) { + outcome, + extra_sources = "", + ahead = 1, + pop_scaling = TRUE, + trainer = parsnip::linear_reg(), + quantile_levels = covidhub_probs(), + smooth_width = 7, + smooth_cols = NULL, + sd_width = 28, + sd_mean_width = 14, + sd_cols = NULL, + ...) 
{ # perform any preprocessing not supported by epipredict # this is a temp fix until a real fix gets put into epipredict epi_data <- clear_lastminute_nas(epi_data) # one that every forecaster will need to handle: how to manage max(time_value) # that's older than the `as_of` date - epidataAhead <- extend_ahead(epi_data, ahead) + c(epi_data, effective_ahead) %<-% extend_ahead(epi_data, ahead) # see latency_adjusting for other examples - # this next part is basically unavoidable boilerplate you'll want to copy - epi_data <- epidataAhead[[1]] - effective_ahead <- epidataAhead[[2]] args_input <- list(...) # edge case where there is no data or less data than the lags; eventually epipredict will handle this if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) { - null_result <- tibble( - geo_value = character(), - forecast_date = lubridate::Date(), - target_end_date = lubridate::Date(), - quantile = numeric(), - value = numeric() - ) + null_result <- epi_data[0L, c("geo_value", attr(epi_data, "metadata", exact = TRUE)[["other_keys"]])] %>% + mutate( + forecast_date = epi_data$time_value[0], + target_end_date = epi_data$time_value[0], + quantile = numeric(), + value = numeric() + ) return(null_result) } args_input[["ahead"]] <- effective_ahead args_input[["quantile_levels"]] <- quantile_levels args_list <- do.call(arx_args_list, args_input) - # if you want to ignore extra_sources, setting predictors is the way to do it + # `extra_sources` sets which variables beyond the outcome are lagged and used as predictors + # any which are modified by `rolling_mean` or `rolling_sd` have their original values dropped later predictors <- c(outcome, extra_sources) - # TODO: Partial match quantile_level coming from here (on Dmitry's machine) - argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) - args_list <- argsPredictorsTrainer[[1]] - predictors <- argsPredictorsTrainer[[2]] - trainer <- argsPredictorsTrainer[[3]] # end of the copypasta # finally, any other pre-processing (e.g. smoothing) that isn't performed by # epipredict # smoothing - keep_mean <- (smooth_width == sd_mean_width) # do we need to do the mean separately? + keep_mean <- !is.null(smooth_width) && !is.null(sd_mean_width) && + smooth_width == sd_mean_width # do we (not) need to do the mean separately? 
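+  # e.g. with the defaults smooth_width = 7 and sd_mean_width = 14 this is FALSE, so the 7-day
+  # mean becomes its own column and rolling_sd computes (then drops) a separate 14-day mean;
+  # if both widths were 14, the single shared mean would be computed once by rolling_sd and kept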
if (!is.null(smooth_width) && !keep_mean) { epi_data %<>% rolling_mean( width = smooth_width, @@ -101,8 +105,10 @@ smoothed_scaled <- function(epi_data, keep_mean = keep_mean ) } - # and need to make sure we exclude the original varialbes as predictors + # and need to make sure we exclude the original variables as predictors predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors) + # TODO: Partial match quantile_level coming from here (on Dmitry's machine) + c(args_list, predictors, trainer) %<-% perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) # preprocessing supported by epipredict preproc <- epi_recipe(epi_data) if (pop_scaling) { diff --git a/R/targets_utils.R b/R/targets_utils.R index 1743e5da..033a20b1 100644 --- a/R/targets_utils.R +++ b/R/targets_utils.R @@ -133,7 +133,7 @@ make_shared_grids <- function() { forecaster = "scaled_pop", trainer = c("linreg", "quantreg"), ahead = c(1:7, 14, 21, 28), - lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14)), + lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14), c(0,7,14,24)), pop_scaling = c(FALSE) ), tidyr::expand_grid( @@ -144,7 +144,7 @@ forecaster = "smoothed_scaled", trainer = c("quantreg"), ahead = c(1:7, 14, 21, 28), - lags = list(list(c(0, 3, 5, 7, 14), c(0)), c(0, 7, 14)), + lags = list(list(c(0, 3, 5, 7, 14), c(0), c(0, 3, 5, 7, 14), c(0)), c(0, 7, 14), c(0,2,4,7,14,21,28)), pop_scaling = c(FALSE) ) ) diff --git a/man/get_trainable_names.Rd b/man/get_trainable_names.Rd index 700c9216..8f5faf10 100644 --- a/man/get_trainable_names.Rd +++ b/man/get_trainable_names.Rd @@ -7,7 +7,7 @@ get_trainable_names(epi_data, cols) } \arguments{ -\item{epi_data}{the epi_data tibble} +\item{epi_data}{the \code{epi_df}} \item{cols}{vector of column names to use. If \code{NULL}, fill with all non-key columns} } diff --git a/man/smoothed_scaled.Rd b/man/smoothed_scaled.Rd index 683c56d7..7306d3b3 100644 --- a/man/smoothed_scaled.Rd +++ b/man/smoothed_scaled.Rd @@ -34,9 +34,18 @@ to the max time value of the \code{epi_df}. how to handle this is a modelling question left up to each forecaster; see latency_adjusting.R for the existing examples)} -\item{pop_scaling}{an example extra parameter unique to this forecaster} +\item{pop_scaling}{bool; if \code{TRUE}, assume all numeric columns are on the +count scale and translate them to a rate scale for model fitting. +Predictions will be translated back to count scale. Any +\code{layer_residual_quantiles} (for non-\code{"quantile_reg"} \code{trainer}s) will be +done on the rate scale. When specifying predictor lags, note that rate +variables will use the same names as and overwrite the count variables. +Rates here will be counts per 100k population, based on +\code{epipredict::state_census}.} -\item{trainer}{an example extra parameter that is fairly common} +\item{trainer}{optional; parsnip model specification to use for the core +fitting & prediction (the \code{spec} of the internal +\code{\link[epipredict:epi_workflow]{epipredict::epi_workflow}}). Default is \code{parsnip::linear_reg()}.} \item{quantile_levels}{The quantile levels to predict.
Defaults to those} diff --git a/man/update_predictors.Rd b/man/update_predictors.Rd index 9b9e603c..435e5aba 100644 --- a/man/update_predictors.Rd +++ b/man/update_predictors.Rd @@ -7,12 +7,17 @@ update_predictors(epi_data, cols_modified, predictors) } \arguments{ -\item{epi_data}{the epi_df} +\item{epi_data}{the epi_df, only included to get the non-key column names} -\item{cols_modified}{the list of columns} +\item{cols_modified}{the list of columns which have been modified. If this is \code{NULL}, that means we were modifying every column.} -\item{predictors}{the initial set of predictors; any unmodified are kept, any modified are replaced} +\item{predictors}{the initial set of predictors; any unmodified are kept, any modified are replaced with the modified versions (e.g. "a" becoming "a_m17").} +} +\value{ +returns an updated list of predictors, with modified columns replaced and non-modified columns left intact. } \description{ -should only be applied after both rolling_mean and rolling_sd +modifies the list of predictors so that any which have been modified have the +modified versions included, and not the original. Should only be applied +after both rolling_mean and rolling_sd. } diff --git a/tests/testthat/test-forecasters-basics.R b/tests/testthat/test-forecasters-basics.R index 785b6ad2..0c61a8d8 100644 --- a/tests/testthat/test-forecasters-basics.R +++ b/tests/testthat/test-forecasters-basics.R @@ -5,7 +5,6 @@ forecasters <- list( c("flatline_fc", flatline_fc), c("smoothed_scaled", smoothed_scaled) ) -forecaster <- forecasters[[3]] for (forecaster in forecasters) { test_that(paste(forecaster[[1]], "gets the date and columns right"), { jhu <- epipredict::case_death_rate_subset %>% diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R index e6b79ed8..fdea75b2 100644 --- a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -52,14 +52,19 @@ test_that("rolling_sd generates correct standard deviation", { rolled <- rolling_sd(epi_data, keep_mean = TRUE) expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_sd28", "b_m14", "b_sd28")) # hand specified rolling mean with a rear window of 7, noting that mean(1:14) = 7.5 - linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), seq(from = 8.5, to = 16.5, by = 1), seq(from = 17, to = 32, by = 1)) - linear_roll_mean + linear_roll_mean <- c(seq(from = 1, to = 7, by = .5), seq(from = 8, to = 16, by = 1), seq(from = 16.5, to = 32.5, by = 1)) +## linear_roll_mean <- c(seq(from = 1, by = .5, length.out = 14), seq(from = 8.5, to = 32.5, by = 1)) +## gap_starts <- epi_data %>% filter(geo_value == "al" & time_value == as.Date("2012-01-11")) %>% pull(a) +## unusual_days <- map_vec(seq(from = 0, to = 5), \(d) mean(((gap_starts + d) - 0):max((gap_starts + d) - 14, 1))) +## map(seq(from = 0, to = 5), \(d) mean(((gap_starts + d) - 0):max((gap_starts + d) - 13, 1))) +## linear_roll_mean +## rolled %>% filter(geo_value == "al") %>% pull("a_m14") expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m14"), linear_roll_mean) # and the standard deviation is linear_roll_mean <- append(linear_roll_mean, NA, after = removed_date - 1) linear_values <- 1:39 linear_values <- append(linear_values, NA, after = removed_date - 1) - linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28)) + linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28 - 1)) # drop the
extra date caused by the inclusion of the NAs linear_roll_sd <- linear_roll_sd[-(removed_date)] expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_sd28"), linear_roll_sd) From 16dd527a250e3a02072360a08652d5abe3383148 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 16:21:17 -0800 Subject: [PATCH 16/20] fix tests (sd lag should only be 0) --- DESCRIPTION | 3 ++- R/data_validation.R | 2 +- R/forecaster_smoothed_scaled.R | 2 ++ tests/testthat/test-forecasters-data.R | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 21fbcc91..01a09444 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,7 +30,8 @@ Imports: slider, targets, tibble, - tidyr + tidyr, + zeallot Suggests: ggplot2, knitr, diff --git a/R/data_validation.R b/R/data_validation.R index da4c134e..2a0838e5 100644 --- a/R/data_validation.R +++ b/R/data_validation.R @@ -56,7 +56,7 @@ perform_sanity_checks <- function(epi_data, #' @export confirm_sufficient_data <- function(epi_data, ahead, args_input, buffer = 9) { if (!is.null(args_input$lags)) { - lag_max <- max(args_input$lags) + lag_max <- max(unlist(args_input$lags)) } else { lag_max <- 14 # default value of 2 weeks } diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R index b1c02275..32a3929d 100644 --- a/R/forecaster_smoothed_scaled.R +++ b/R/forecaster_smoothed_scaled.R @@ -5,6 +5,8 @@ #' /target/ it still uses the raw value (this captures some of the noise). It #' also uses a rolling standard deviation as an auxillary signal, window of #' withd `sd_width`, which by default is 28 days. +#' If you are using `sd_width`, you should restrict the lags on the `sd` to only +#' include `0`, so set your lags to be e.g. `list(c(0,7,14), c(0))`. #' @param epi_data the actual data used #' @param outcome the name of the target variable #' @param extra_sources the name of any extra columns to use. 
This list could be diff --git a/tests/testthat/test-forecasters-data.R b/tests/testthat/test-forecasters-data.R index d12b4a40..4fc65f5d 100644 --- a/tests/testthat/test-forecasters-data.R +++ b/tests/testthat/test-forecasters-data.R @@ -7,7 +7,7 @@ forecasters <- tibble::tribble( scaled_pop, list(1, TRUE), list("ahead", "pop_scaling"), "scaled_pop", scaled_pop, list(1, FALSE), list("ahead", "pop_scaling"), "scaled_pop", flatline_fc, list(1), list("ahead"), "flatline_fc", - smoothed_scaled, list(1), list("ahead"), "smoothed_scaled" + smoothed_scaled, list(1, list(c(0,7,14), c(0))), list("ahead", "lags"), "smoothed_scaled" ) expects_nonequal <- c("scaled_pop", "smoothed_scaled") synth_mean <- 25 From 4b34428ba2ae67e3587644eef124b2604ebaa011 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 16:31:48 -0800 Subject: [PATCH 17/20] include smoothed_scaling in the targets --- R/targets_utils.R | 11 ++--------- covid_hosp_explore.R | 7 +++++++ flu_hosp_explore.R | 10 +++++++++- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/R/targets_utils.R b/R/targets_utils.R index 033a20b1..d3fd435e 100644 --- a/R/targets_utils.R +++ b/R/targets_utils.R @@ -127,26 +127,19 @@ make_shared_grids <- function() { forecaster = "scaled_pop", trainer = c("linreg", "quantreg"), ahead = c(1:7, 14, 21, 28), - pop_scaling = c(FALSE) + pop_scaling = FALSE ), tidyr::expand_grid( forecaster = "scaled_pop", trainer = c("linreg", "quantreg"), ahead = c(1:7, 14, 21, 28), lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14), c(0,7,14,24)), - pop_scaling = c(FALSE) + pop_scaling = FALSE ), tidyr::expand_grid( forecaster = "flatline_fc", ahead = c(1:7, 14, 21, 28) ), - tidyr::expand_grid( - forecaster = "smoothed_scaled", - trainer = c("quantreg"), - ahead = c(1:7, 14, 21, 28), - lags = list(list(c(0, 3, 5, 7, 14), c(0),c(0, 3, 5, 7, 14), c(0),), c(0, 7, 14), c(0,2,4,7,14,21,28)), - pop_scaling = c(FALSE) - ) ) } #' Make list of common ensembles for forecasting experiments across projects diff --git a/covid_hosp_explore.R b/covid_hosp_explore.R index f9aa0d8e..456b4b3e 100644 --- a/covid_hosp_explore.R +++ b/covid_hosp_explore.R @@ -18,6 +18,13 @@ make_unique_grids <- function() { ahead = c(1:7, 14, 21, 28), lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14)), pop_scaling = TRUE + ), + tidyr::expand_grid( + forecaster = "smoothed_scaled", + trainer = c("quantreg"), + ahead = c(1:7, 14, 21, 28), + lags = list(list(c(0, 3, 5, 7, 14), c(0), c(0, 3, 5, 7, 14), c(0),), list(c(0, 7, 14), c(0)), list(c(0,2,4,7,14,21,28), c(0))), + pop_scaling = TRUE ) ) } diff --git a/flu_hosp_explore.R b/flu_hosp_explore.R index a3486080..31d0ffc1 100644 --- a/flu_hosp_explore.R +++ b/flu_hosp_explore.R @@ -5,7 +5,15 @@ source("extras/targets-common.R") # Add custom parameter combinations in the list below. 
make_unique_grids <- function() { - list() + list( + tidyr::expand_grid( + forecaster = "smoothed_scaled", + trainer = c("quantreg"), + ahead = c(1:7, 14, 21, 28), + lags = list(list(c(0, 3, 5, 7, 14), c(0), c(0, 3, 5, 7, 14), c(0)), list(c(0, 7, 14), c(0)), list(c(0,2,4,7,14,21,28), c(0))), + pop_scaling = FALSE + ) + ) } make_unique_ensemble_grid <- function() { tribble( From af3ff9535d3918d640f92a0280a077edde1b374b Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 16:41:46 -0800 Subject: [PATCH 18/20] perform_sanity_checks -> sanitize_args_predictors_trainer --- NAMESPACE | 2 +- R/data_validation.R | 2 +- R/forecaster_flatline.R | 4 +--- R/forecaster_scaled_pop.R | 7 ++----- R/forecaster_smoothed_scaled.R | 4 ++-- ...checks.Rd => sanitize_args_predictors_trainer.Rd} | 12 +++++++++--- man/scaled_pop.Rd | 2 +- man/smoothed_scaled.Rd | 4 +++- tests/testthat/test-forecaster-utils.R | 6 +++--- 9 files changed, 23 insertions(+), 20 deletions(-) rename man/{perform_sanity_checks.Rd => sanitize_args_predictors_trainer.Rd} (85%) diff --git a/NAMESPACE b/NAMESPACE index e6f711c3..3298ff06 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,12 +28,12 @@ export(make_shared_grids) export(make_target_ensemble_grid) export(make_target_param_grid) export(overprediction) -export(perform_sanity_checks) export(read_external_predictions_data) export(rolling_mean) export(rolling_sd) export(run_evaluation_measure) export(run_workflow_and_format) +export(sanitize_args_predictors_trainer) export(scaled_pop) export(sharpness) export(single_id) diff --git a/R/data_validation.R b/R/data_validation.R index 2a0838e5..e9319334 100644 --- a/R/data_validation.R +++ b/R/data_validation.R @@ -13,7 +13,7 @@ #' include empty strings #' @param args_list the args list created by [`epipredict::arx_args_list`] #' @export -perform_sanity_checks <- function(epi_data, +sanitize_args_predictors_trainer <- function(epi_data, outcome, predictors, trainer, diff --git a/R/forecaster_flatline.R b/R/forecaster_flatline.R index ab0c5075..687511b0 100644 --- a/R/forecaster_flatline.R +++ b/R/forecaster_flatline.R @@ -40,9 +40,7 @@ flatline_fc <- function(epi_data, args_list <- do.call(flatline_args_list, args_input) # if you want to ignore extra_sources, setting predictors is the way to do it predictors <- c(outcome, extra_sources) - argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, NULL, args_list) - args_list <- argsPredictorsTrainer[[1]] - predictors <- argsPredictorsTrainer[[2]] + c(args_list, predictors) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, NULL, args_list) # end of the copypasta # finally, any other pre-processing (e.g. smoothing) that isn't performed by # epipredict diff --git a/R/forecaster_scaled_pop.R b/R/forecaster_scaled_pop.R index 2b5d0020..c849cd9e 100644 --- a/R/forecaster_scaled_pop.R +++ b/R/forecaster_scaled_pop.R @@ -35,7 +35,7 @@ #' @param quantile_levels The quantile levels to predict. Defaults to those required by #' covidhub.
#' @seealso some utilities for making forecasters: [format_storage], -#' [perform_sanity_checks] +#' [sanitize_args_predictors_trainer] #' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling #' @importFrom tibble tibble #' @importFrom recipes all_numeric @@ -76,10 +76,7 @@ scaled_pop <- function(epi_data, # if you want to hardcode particular predictors in a particular forecaster predictors <- c(outcome, extra_sources) # TODO: Partial match quantile_level coming from here (on Dmitry's machine) - argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) - args_list <- argsPredictorsTrainer[[1]] - predictors <- argsPredictorsTrainer[[2]] - trainer <- argsPredictorsTrainer[[3]] + c(args_list, predictors, trainer) <- sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list) # end of the copypasta # finally, any other pre-processing (e.g. smoothing) that isn't performed by # epipredict diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R index 32a3929d..8428cc08 100644 --- a/R/forecaster_smoothed_scaled.R +++ b/R/forecaster_smoothed_scaled.R @@ -41,7 +41,7 @@ #' @param ... any additional arguments as used by [arx_args_list] #' required by covidhub. #' @seealso some utilities for making forecasters: [format_storage], -#' [perform_sanity_checks] +#' [sanitize_args_predictors_trainer] #' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling #' @importFrom tibble tibble #' @importFrom recipes all_numeric @@ -110,7 +110,7 @@ smoothed_scaled <- function(epi_data, # and need to make sure we exclude the original variables as predictors predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors) # TODO: Partial match quantile_level coming from here (on Dmitry's machine) - c(args_list, predictors, trainer) %<-% perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) + c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list) # preprocessing supported by epipredict preproc <- epi_recipe(epi_data) if (pop_scaling) { diff --git a/man/perform_sanity_checks.Rd b/man/sanitize_args_predictors_trainer.Rd similarity index 85% rename from man/perform_sanity_checks.Rd rename to man/sanitize_args_predictors_trainer.Rd index 152b3bf1..b56706ee 100644 --- a/man/perform_sanity_checks.Rd +++ b/man/sanitize_args_predictors_trainer.Rd @@ -1,10 +1,16 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_validation.R -\name{perform_sanity_checks} -\alias{perform_sanity_checks} +\name{sanitize_args_predictors_trainer} +\alias{sanitize_args_predictors_trainer} \title{helper function for those writing forecasters} \usage{ -perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) +sanitize_args_predictors_trainer( + epi_data, + outcome, + predictors, + trainer, + args_list +) } \arguments{ \item{epi_data}{the actual data used} diff --git a/man/scaled_pop.Rd b/man/scaled_pop.Rd index 880819eb..5522c85c 100644 --- a/man/scaled_pop.Rd +++ b/man/scaled_pop.Rd @@ -63,5 +63,5 @@ arguments: } \seealso{ some utilities for making forecasters: \link{format_storage}, -\link{perform_sanity_checks} +\link{sanitize_args_predictors_trainer} } diff --git a/man/smoothed_scaled.Rd b/man/smoothed_scaled.Rd index 7306d3b3..16a3d2c0 100644 --- a/man/smoothed_scaled.Rd +++ b/man/smoothed_scaled.Rd @@ -73,8 
+73,10 @@ the data. Even if the target is smoothed when used as a /predictor/, as a /target/ it still uses the raw value (this captures some of the noise). It also uses a rolling standard deviation as an auxillary signal, window of withd \code{sd_width}, which by default is 28 days. +If you are using \code{sd_width}, you should restrict the lags on the \code{sd} to only +include \code{0}, so set your lags to be e.g. \code{list(c(0,7,14), c(0))}. } \seealso{ some utilities for making forecasters: \link{format_storage}, -\link{perform_sanity_checks} +\link{sanitize_args_predictors_trainer} } diff --git a/tests/testthat/test-forecaster-utils.R b/tests/testthat/test-forecaster-utils.R index 863acc80..dfe5f4ed 100644 --- a/tests/testthat/test-forecaster-utils.R +++ b/tests/testthat/test-forecaster-utils.R @@ -1,10 +1,10 @@ -test_that("perform_sanity_checks", { +test_that("sanitize_args_predictors_trainer", { epi_data <- epipredict::case_death_rate_subset # don't need to test validate_forecaster_inputs as that's inherited # testing args_list inheritance ex_args <- arx_args_list() - expect_error(perform_sanity_checks(epi_data, "case_rate", c("case_rate"), 5, ex_args)) - argsPredictors <- perform_sanity_checks( + expect_error(sanitize_args_predictors_trainer(epi_data, "case_rate", c("case_rate"), 5, ex_args)) + argsPredictors <- sanitize_args_predictors_trainer( epi_data, "case_rate", c("case_rate", ""), parsnip::linear_reg(), ex_args ) args_list <- argsPredictors[[1]] From ebb9db3c92f57e3354dab6c777cb1a4f07708603 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 19:38:02 -0800 Subject: [PATCH 19/20] zeallot (%<-%) needs all args --- R/forecaster_flatline.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/forecaster_flatline.R b/R/forecaster_flatline.R index 687511b0..33687240 100644 --- a/R/forecaster_flatline.R +++ b/R/forecaster_flatline.R @@ -40,7 +40,7 @@ flatline_fc <- function(epi_data, args_list <- do.call(flatline_args_list, args_input) # if you want to ignore extra_sources, setting predictors is the way to do it predictors <- c(outcome, extra_sources) - c(args_list, predictors) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, NULL, args_list) + c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, NULL, args_list) # end of the copypasta # finally, any other pre-processing (e.g. 
smoothing) that isn't performed by # epipredict From 15d35d784256cf82de8d0c54645e66e12bd789d4 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 19:59:23 -0800 Subject: [PATCH 20/20] fix %<-% usage --- R/forecaster_scaled_pop.R | 3 ++- tests/testthat/test-forecasters-basics.R | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/R/forecaster_scaled_pop.R b/R/forecaster_scaled_pop.R index c849cd9e..6c138caa 100644 --- a/R/forecaster_scaled_pop.R +++ b/R/forecaster_scaled_pop.R @@ -38,6 +38,7 @@ #' [sanitize_args_predictors_trainer] #' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling #' @importFrom tibble tibble +#' @importFrom zeallot %<-% #' @importFrom recipes all_numeric #' @export scaled_pop <- function(epi_data, @@ -76,7 +77,7 @@ scaled_pop <- function(epi_data, # if you want to hardcode particular predictors in a particular forecaster predictors <- c(outcome, extra_sources) # TODO: Partial match quantile_level coming from here (on Dmitry's machine) - c(args_list, predictors, trainer) <- sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list) + c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list) # end of the copypasta # finally, any other pre-processing (e.g. smoothing) that isn't performed by # epipredict diff --git a/tests/testthat/test-forecasters-basics.R b/tests/testthat/test-forecasters-basics.R index 0c61a8d8..b0cc1da4 100644 --- a/tests/testthat/test-forecasters-basics.R +++ b/tests/testthat/test-forecasters-basics.R @@ -5,6 +5,7 @@ forecasters <- list( c("flatline_fc", flatline_fc), c("smoothed_scaled", smoothed_scaled) ) +forecaster <- c("scaled_pop", scaled_pop) for (forecaster in forecasters) { test_that(paste(forecaster[[1]], "gets the date and columns right"), { jhu <- epipredict::case_death_rate_subset %>%