From e4b3d4515c7bd32366aafd8fcfaeafac8fd7e0a3 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 8 Dec 2023 16:44:12 -0800 Subject: [PATCH 01/20] feat: rolling_mean/sd for a new forecaster --- DESCRIPTION | 1 + NAMESPACE | 5 +++ R/data_transforms.R | 69 ++++++++++++++++++++++++++++++++ man/get_trainable_names.Rd | 17 ++++++++ man/rolling_mean.Rd | 21 ++++++++++ man/rolling_sd.Rd | 33 +++++++++++++++ man/smooth_scaled.Rd | 65 ++++++++++++++++++++++++++++++ tests/testthat/test-transforms.R | 37 +++++++++++++++++ 8 files changed, 248 insertions(+) create mode 100644 R/data_transforms.R create mode 100644 man/get_trainable_names.Rd create mode 100644 man/rolling_mean.Rd create mode 100644 man/rolling_sd.Rd create mode 100644 man/smooth_scaled.Rd create mode 100644 tests/testthat/test-transforms.R diff --git a/DESCRIPTION b/DESCRIPTION index ad32b71e..21fbcc91 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,6 +27,7 @@ Imports: purrr, recipes (>= 1.0.4), rlang, + slider, targets, tibble, tidyr diff --git a/NAMESPACE b/NAMESPACE index 72811f7d..849bac7c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,12 +30,15 @@ export(make_target_param_grid) export(overprediction) export(perform_sanity_checks) export(read_external_predictions_data) +export(rolling_mean) +export(rolling_sd) export(run_evaluation_measure) export(run_workflow_and_format) export(scaled_pop) export(sharpness) export(single_id) export(slide_forecaster) +export(smooth_scaled) export(underprediction) export(weighted_interval_score) importFrom(assertthat,assert_that) @@ -96,6 +99,8 @@ importFrom(rlang,.data) importFrom(rlang,quo) importFrom(rlang,sym) importFrom(rlang,syms) +importFrom(slider,slide2_dbl) +importFrom(slider,slide_dbl) importFrom(targets,tar_config_get) importFrom(targets,tar_group) importFrom(targets,tar_read) diff --git a/R/data_transforms.R b/R/data_transforms.R new file mode 100644 index 00000000..99da29a6 --- /dev/null +++ b/R/data_transforms.R @@ -0,0 +1,69 @@ +# various reusable transforms to apply before handing to epipredict + +#' extract the non-key columns from epi_data +#' @keywords internal +#' @param epi_data the epi_data tibble +#' @param cols vector of column names to use. If `NULL`, fill with all non-key columns +get_trainable_names <- function(epi_data, cols) { + if (is.null(cols)) { + cols <- names(epi_data) + cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))] + } + return(cols) +} + +#' get a rolling average for the named columns +#' @description +#' add column(s) that are the rolling means of the specified columns, as +#' implemented by slider. Defaults to the previous 7 days. +#' Currently only group_by's on the geo_value. Should probably extend to more +#' keys if you have them +#' @param epi_data the dataset +#' @param width the number of days (or examples, the sliding isn't time-aware) to use +#' @param cols_to_mean the non-key columns to take the mean over. `NULL` means all +#' @importFrom slider slide_dbl +#' @export +rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) { + cols_to_mean <- get_trainable_names(epi_data, cols_to_mean) + epi_data %<>% group_by(geo_value) + for (col in cols_to_mean) { + mean_name <- paste0(col, width) + epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = width)) + } + epi_data %<>% ungroup() + return(epi_data) +} + +#' get a rolling standard deviation for the named columns +#' @description +#' A rolling standard deviation, based off of a rolling mean. 
First it
+#' calculates a rolling mean with width `mean_width`, and then takes the root
+#' mean square of the difference between that and the actual value over `sd_width`.
+#' @param epi_data the dataset
+#' @param sd_width the number of days (or examples, the sliding isn't
+#' time-aware) to use for the standard deviation calculation
+#' @param mean_width like `sd_width`, but it governs the mean. Should be less
+#' than the `sd_width`, and if `NULL` (the default) it is half of `sd_width`
+#' (so 14 in the complete default case)
+#' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
+#' @param keep_mean bool, if `TRUE`, it retains the mean column
+#' @importFrom slider slide_dbl slide2_dbl
+#' @export
+rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
+  if (is.null(mean_width)) {
+    mean_width <- as.integer(ceiling(sd_width / 2))
+  }
+  cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
+  epi_data %<>% group_by(geo_value)
+  for (col in cols_to_sd) {
+    mean_name <- paste0(col, "_m", mean_width)
+    sd_name <- paste0(col, "_SD", sd_width)
+    epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width))
+    epi_data %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width))
+    if (!keep_mean) {
+      epi_data %<>% select(-{{ mean_name }})
+    }
+  }
+  epi_data %<>% ungroup()
+  return(epi_data)
+}
diff --git a/man/get_trainable_names.Rd b/man/get_trainable_names.Rd
new file mode 100644
index 00000000..42650dbe
--- /dev/null
+++ b/man/get_trainable_names.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{get_trainable_names}
+\alias{get_trainable_names}
+\title{extract the non-key columns from epi_data}
+\usage{
+get_trainable_names(epi_data, cols)
+}
+\arguments{
+\item{epi_data}{the epi_data tibble}
+
+\item{cols}{vector of column names to use. If \code{NULL}, fill with all non-key columns}
+}
+\description{
+extract the non-key columns from epi_data
+}
+\keyword{internal}
diff --git a/man/rolling_mean.Rd b/man/rolling_mean.Rd
new file mode 100644
index 00000000..f8cf5000
--- /dev/null
+++ b/man/rolling_mean.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{rolling_mean}
+\alias{rolling_mean}
+\title{get a rolling average for the named columns}
+\usage{
+rolling_mean(epi_data, width = 7L, cols_to_mean = NULL)
+}
+\arguments{
+\item{epi_data}{the dataset}
+
+\item{width}{the number of days (or examples, the sliding isn't time-aware) to use}
+
+\item{cols_to_mean}{the non-key columns to take the mean over. \code{NULL} means all}
+}
+\description{
+add column(s) that are the rolling means of the specified columns, as
+implemented by slider. Defaults to the previous 7 days.
+Currently only group_by's on the geo_value. 
Should probably extend to more
+keys if you have them
+}
diff --git a/man/rolling_sd.Rd b/man/rolling_sd.Rd
new file mode 100644
index 00000000..22f3ec0c
--- /dev/null
+++ b/man/rolling_sd.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{rolling_sd}
+\alias{rolling_sd}
+\title{get a rolling standard deviation for the named columns}
+\usage{
+rolling_sd(
+  epi_data,
+  sd_width = 28L,
+  mean_width = NULL,
+  cols_to_sd = NULL,
+  keep_mean = FALSE
+)
+}
+\arguments{
+\item{epi_data}{the dataset}
+
+\item{sd_width}{the number of days (or examples, the sliding isn't
+time-aware) to use for the standard deviation calculation}
+
+\item{mean_width}{like \code{sd_width}, but it governs the mean. Should be less
+than the \code{sd_width}, and if \code{NULL} (the default) it is half of \code{sd_width}
+(so 14 in the complete default case)}
+
+\item{cols_to_sd}{the non-key columns to take the sd over. \code{NULL} means all}
+
+\item{keep_mean}{bool, if \code{TRUE}, it retains the mean column}
+}
+\description{
+A rolling standard deviation, based off of a rolling mean. First it
+calculates a rolling mean with width \code{mean_width}, and then takes the root
+mean square of the difference between that and the actual value over \code{sd_width}.
+}
diff --git a/man/smooth_scaled.Rd b/man/smooth_scaled.Rd
new file mode 100644
index 00000000..65698932
--- /dev/null
+++ b/man/smooth_scaled.Rd
@@ -0,0 +1,65 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/forecaster_smoothed_scaled.R
+\name{smooth_scaled}
+\alias{smooth_scaled}
+\title{predict on smoothed data and the standard deviation}
+\usage{
+smooth_scaled(
+  epi_data,
+  outcome,
+  extra_sources = "",
+  ahead = 1,
+  pop_scaling = TRUE,
+  trainer = parsnip::linear_reg(),
+  quantile_levels = covidhub_probs(),
+  smooth_days = 7,
+  smooth_cols = NULL,
+  sd_days = 28,
+  sd_cols = NULL,
+  ...
+)
+}
+\arguments{
+\item{epi_data}{the actual data used}
+
+\item{outcome}{the name of the target variable}
+
+\item{extra_sources}{the name of any extra columns to use. This list could be
+empty}
+
+\item{ahead}{(this is relative to the \code{as_of} field of the \code{epi_df}, which is
+likely \emph{not} the same as the \code{ahead} used by epipredict, which is relative
+to the max time value of the \code{epi_df}. how to handle this is a modelling
+question left up to each forecaster; see latency_adjusting.R for the
+existing examples)}
+
+\item{pop_scaling}{an example extra parameter unique to this forecaster}
+
+\item{trainer}{an example extra parameter that is fairly common}
+
+\item{quantile_levels}{The quantile levels to predict. Defaults to those
+required by covidhub.}
+
+\item{smooth_days}{the number of days over which to do smoothing. If \code{NULL},
+then no smoothing is applied.}
+
+\item{smooth_cols}{the names of the columns to smooth. If \code{NULL} it smooths
+everything}
+
+\item{sd_cols}{the names of the columns to smooth. If \code{NULL} its includes
+the sd ofeverything}
+
+\item{sd_width}{the number of days over which to take a moving average of the
+standard deviation. If \code{NULL}, the sd_width isn't included.}
+}
+\description{
+This is a variant of \code{scaled_pop}, which predicts on a smoothed version of
+the data. Even if the target is smoothed when used as a /predictor/, as a
+/target/ it still uses the raw value (this captures some of the noise). 
It
+also uses a rolling standard deviation as an auxiliary signal, with a
+window of width \code{sd_width}, which by default is 28 days.
+}
+\seealso{
+some utilities for making forecasters: \link{format_storage},
+\link{perform_sanity_checks}
+}
diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R
new file mode 100644
index 00000000..ea762391
--- /dev/null
+++ b/tests/testthat/test-transforms.R
@@ -0,0 +1,37 @@
+n_days <- 40
+simple_dates <- seq(as.Date("2012-01-01"), by = "day", length.out = n_days)
+rand_vals <- rnorm(n_days)
+epi_data <- epiprocess::as_epi_df(rbind(tibble(
+geo_value = "al",
+time_value = simple_dates,
+a = 1:n_days,
+b = rand_vals
+), tibble(
+geo_value = "ca",
+time_value = simple_dates,
+a = n_days:1,
+b = rand_vals + 10
+)))
+test_that("rolling_mean generates correct mean", {
+  rolled <- rolling_mean(epi_data)
+  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a7", "b7"))
+  # hand specified rolling mean with a rear window of 7, noting that mean(1:7) = 4
+  linear_roll_mean <- c(seq(from=1, to = 4, by = .5), seq(from = 4.5, to = 36.5, by = 1))
+  expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a7"), linear_roll_mean)
+  # same, but "ca" is reversed, noting mean(40:(40-7)) =36.5
+  linear_reverse_roll_mean <- c(seq(from=40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1))
+  expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a7"), linear_reverse_roll_mean)
+})
+
+test_that("rolling_sd generates correct standard deviation", {
+  rolled <- rolling_sd(epi_data)
+  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_SD28", "b_SD28"))
+  # hand specified rolling mean with a rear window of 7, noting that mean(1:14) = 7.5
+  linear_roll_mean <- c(seq(from=1, to = 7.5, by = .5), seq(from = 8, to = 33, by = 1))
+  # and the standard deviation is
+  linear_roll_sd <- sqrt(slider::slide_dbl((1:40 - linear_roll_mean)^2, mean, .before = 28))
+  expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_SD28"), linear_roll_sd)
+  # even though ca is reversed, the changes are all the same, so the standard deviation is *exactly* the same values
+  expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_SD28"), linear_roll_sd)
+  })
+# TODO example with NA's, example with missing days, only one column, keep_mean

From 4a788103a7a7dcf4ad949c767e931e546d8edf99 Mon Sep 17 00:00:00 2001
From: dsweber2
Date: Mon, 11 Dec 2023 15:27:36 -0800
Subject: [PATCH 02/20] consistent name, only smooth non-smoothed, init forecaster

---
 NAMESPACE                                    |   2 +-
 R/data_transforms.R                          |   6 +-
 R/forecaster_smoothed_scaled.R               | 138 +++++++++++++++++++
 man/get_trainable_names.Rd                   |   4 +-
 man/{smooth_scaled.Rd => smoothed_scaled.Rd} |  22 +--
 tests/testthat/test-forecasters-basics.R     |   3 +-
 tests/testthat/test-transforms.R             |  35 +++--
 7 files changed, 180 insertions(+), 30 deletions(-)
 create mode 100644 R/forecaster_smoothed_scaled.R
 rename man/{smooth_scaled.Rd => smoothed_scaled.Rd} (86%)

diff --git a/NAMESPACE b/NAMESPACE
index 849bac7c..f65be2b1 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -38,7 +38,7 @@ export(scaled_pop)
 export(sharpness)
 export(single_id)
 export(slide_forecaster)
-export(smooth_scaled)
+export(smoothed_scaled)
 export(underprediction)
 export(weighted_interval_score)
 importFrom(assertthat,assert_that)
diff --git a/R/data_transforms.R b/R/data_transforms.R
index 99da29a6..43164f4a 100644
--- a/R/data_transforms.R
+++ b/R/data_transforms.R
@@ -1,6 +1,6 @@
 # various reusable transforms to apply before handing to epipredict
 
-#' extract the non-key columns from epi_data
+#' extract the non-key, non-smoothed columns from epi_data
 #' @keywords internal
 #' @param epi_data the epi_data tibble
 #' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
@@ -8,6 +8,8 @@ get_trainable_names <- function(epi_data, cols) {
   if (is.null(cols)) {
     cols <- names(epi_data)
     cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))]
+    # exclude anything with the same naming schema as the rolling average/sd created below
+    cols <- cols[!grepl("_\\w{1,2}\\d+", cols)]
   }
   return(cols)
 }
@@ -27,7 +29,7 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
   cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
   epi_data %<>% group_by(geo_value)
   for (col in cols_to_mean) {
-    mean_name <- paste0(col, width)
+    mean_name <- paste0(col, "_m", width)
     epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = width))
   }
   epi_data %<>% ungroup()
diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R
new file mode 100644
index 00000000..0edbaa78
--- /dev/null
+++ b/R/forecaster_smoothed_scaled.R
@@ -0,0 +1,138 @@
+#' predict on smoothed data and the standard deviation
+#' @description
+#' This is a variant of `scaled_pop`, which predicts on a smoothed version of
+#' the data. Even if the target is smoothed when used as a /predictor/, as a
+#' /target/ it still uses the raw value (this captures some of the noise). It
+#' also uses a rolling standard deviation as an auxiliary signal, with a
+#' window of width `sd_width`, which by default is 28 days.
+#' @param epi_data the actual data used
+#' @param outcome the name of the target variable
+#' @param extra_sources the name of any extra columns to use. This list could be
+#' empty
+#' @param ahead (this is relative to the `as_of` field of the `epi_df`, which is
+#' likely *not* the same as the `ahead` used by epipredict, which is relative
+#' to the max time value of the `epi_df`. how to handle this is a modelling
+#' question left up to each forecaster; see latency_adjusting.R for the
+#' existing examples)
+#' @param pop_scaling an example extra parameter unique to this forecaster
+#' @param trainer an example extra parameter that is fairly common
+#' @param smooth_width the number of days over which to do smoothing. If `NULL`,
+#' then no smoothing is applied.
+#' @param smooth_cols the names of the columns to smooth. If `NULL` it smooths
+#' everything
+#' @param sd_width the number of days over which to take a moving average of the
+#' standard deviation. If `NULL`, the sd_width isn't included.
+#' @param sd_mean_width to calculate the sd, we need a window size for the mean
+#' used.
+#' @param sd_cols the names of the columns to smooth. If `NULL` its includes
+#' the sd of everything
+#' @param quantile_levels The quantile levels to predict. Defaults to those
+#' required by covidhub.
+#' @seealso some utilities for making forecasters: [format_storage],
+#' [perform_sanity_checks]
+#' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling
+#' @importFrom tibble tibble
+#' @importFrom recipes all_numeric
+#' @export
+smoothed_scaled <- function(epi_data,
+                            outcome,
+                            extra_sources = "",
+                            ahead = 1,
+                            pop_scaling = TRUE,
+                            trainer = parsnip::linear_reg(),
+                            quantile_levels = covidhub_probs(),
+                            smooth_width = 7,
+                            smooth_cols = NULL,
+                            sd_width = 28,
+                            sd_mean_width = 14,
+                            sd_cols = NULL,
+                            ...) 
{ + # perform any preprocessing not supported by epipredict + # this is a temp fix until a real fix gets put into epipredict + epi_data <- clear_lastminute_nas(epi_data) + # one that every forecaster will need to handle: how to manage max(time_value) + # that's older than the `as_of` date + epidataAhead <- extend_ahead(epi_data, ahead) + # see latency_adjusting for other examples + # this next part is basically unavoidable boilerplate you'll want to copy + epi_data <- epidataAhead[[1]] + effective_ahead <- epidataAhead[[2]] + args_input <- list(...) + # edge case where there is no data or less data than the lags; eventually epipredict will handle this + if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) { + null_result <- tibble( + geo_value = character(), + forecast_date = lubridate::Date(), + target_end_date = lubridate::Date(), + quantile = numeric(), + value = numeric() + ) + return(null_result) + } + args_input[["ahead"]] <- effective_ahead + args_input[["quantile_levels"]] <- quantile_levels + args_list <- do.call(arx_args_list, args_input) + # if you want to ignore extra_sources, setting predictors is the way to do it + predictors <- c(outcome, extra_sources) + # TODO: Partial match quantile_level coming from here (on Dmitry's machine) + argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) + args_list <- argsPredictorsTrainer[[1]] + predictors <- argsPredictorsTrainer[[2]] + trainer <- argsPredictorsTrainer[[3]] + # end of the copypasta + # finally, any other pre-processing (e.g. smoothing) that isn't performed by + # epipredict + # smoothing + keep_mean <- (smooth_width == sd_mean_width) # do we need to do the mean separately? + if (!is.null(smooth_width) && !keep_mean) { + epi_data %<>% rolling_mean( + width = smooth_width, + cols_to_mean = smooth_cols + ) + } + + # measuring standard deviation + if (!is.null(sd_width)) { + epi_data %<>% rolling_sd( + sd_width = sd_width, + mean_width = sd_mean_width, + cols_to_sd = sd_cols, + keep_mean = keep_mean + ) + } + # even + + # preprocessing supported by epipredict + preproc <- epi_recipe(epi_data) + if (pop_scaling) { + preproc %<>% step_population_scaling( + all_numeric(), + df = epipredict::state_census, + df_pop_col = "pop", + create_new = FALSE, + rate_rescaling = 1e5, + by = c("geo_value" = "abbr") + ) + } + preproc %<>% arx_preprocess(outcome, predictors, args_list) + + # postprocessing supported by epipredict + postproc <- frosting() + postproc %<>% arx_postprocess(trainer, args_list) + if (pop_scaling) { + postproc %<>% layer_population_scaling( + .pred, .pred_distn, + df = epipredict::state_census, + df_pop_col = "pop", + create_new = FALSE, + rate_rescaling = 1e5, + by = c("geo_value" = "abbr") + ) + } + # with all the setup done, we execute and format + pred <- run_workflow_and_format(preproc, postproc, trainer, epi_data) + # now pred has the columns + # (geo_value, forecast_date, target_end_date, quantile, value) + # finally, any postprocessing not supported by epipredict e.g. 
calibration + return(pred) +} diff --git a/man/get_trainable_names.Rd b/man/get_trainable_names.Rd index 42650dbe..700c9216 100644 --- a/man/get_trainable_names.Rd +++ b/man/get_trainable_names.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/data_transforms.R \name{get_trainable_names} \alias{get_trainable_names} -\title{extract the non-key columns from epi_data} +\title{extract the non-key, non-smoothed columns from epi_data} \usage{ get_trainable_names(epi_data, cols) } @@ -12,6 +12,6 @@ get_trainable_names(epi_data, cols) \item{cols}{vector of column names to use. If \code{NULL}, fill with all non-key columns} } \description{ -extract the non-key columns from epi_data +extract the non-key, non-smoothed columns from epi_data } \keyword{internal} diff --git a/man/smooth_scaled.Rd b/man/smoothed_scaled.Rd similarity index 86% rename from man/smooth_scaled.Rd rename to man/smoothed_scaled.Rd index 65698932..4a079fa4 100644 --- a/man/smooth_scaled.Rd +++ b/man/smoothed_scaled.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/forecaster_smoothed_scaled.R -\name{smooth_scaled} -\alias{smooth_scaled} +\name{smoothed_scaled} +\alias{smoothed_scaled} \title{predict on smoothed data and the standard deviation} \usage{ -smooth_scaled( +smoothed_scaled( epi_data, outcome, extra_sources = "", @@ -12,9 +12,10 @@ smooth_scaled( pop_scaling = TRUE, trainer = parsnip::linear_reg(), quantile_levels = covidhub_probs(), - smooth_days = 7, + smooth_width = 7, smooth_cols = NULL, - sd_days = 28, + sd_width = 28, + sd_mean_width = 14, sd_cols = NULL, ... ) @@ -40,17 +41,20 @@ existing examples)} \item{quantile_levels}{The quantile levels to predict. Defaults to those required by covidhub.} -\item{smooth_days}{the number of days over which to do smoothing. If \code{NULL}, +\item{smooth_width}{the number of days over which to do smoothing. If \code{NULL}, then no smoothing is applied.} \item{smooth_cols}{the names of the columns to smooth. If \code{NULL} it smooths everything} -\item{sd_cols}{the names of the columns to smooth. If \code{NULL} its includes -the sd ofeverything} - \item{sd_width}{the number of days over which to take a moving average of the standard deviation. If \code{NULL}, the sd_width isn't included.} + +\item{sd_mean_width}{to calculate the sd, we need a window size for the mean +used.} + +\item{sd_cols}{the names of the columns to smooth. 
If \code{NULL} its includes +the sd of everything} } \description{ This is a variant of \code{scaled_pop}, which predicts on a smoothed version of diff --git a/tests/testthat/test-forecasters-basics.R b/tests/testthat/test-forecasters-basics.R index c9d7a929..0c61a8d8 100644 --- a/tests/testthat/test-forecasters-basics.R +++ b/tests/testthat/test-forecasters-basics.R @@ -2,7 +2,8 @@ library(dplyr) # TODO better way to do this than copypasta forecasters <- list( c("scaled_pop", scaled_pop), - c("flatline_fc", flatline_fc) + c("flatline_fc", flatline_fc), + c("smoothed_scaled", smoothed_scaled) ) for (forecaster in forecasters) { test_that(paste(forecaster[[1]], "gets the date and columns right"), { diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R index ea762391..3db526b4 100644 --- a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -2,36 +2,41 @@ n_days <- 40 simple_dates <- seq(as.Date("2012-01-01"), by = "day", length.out = n_days) rand_vals <- rnorm(n_days) epi_data <- epiprocess::as_epi_df(rbind(tibble( -geo_value = "al", -time_value = simple_dates, -a = 1:n_days, -b = rand_vals + geo_value = "al", + time_value = simple_dates, + a = 1:n_days, + b = rand_vals ), tibble( -geo_value = "ca", -time_value = simple_dates, -a = n_days:1, -b = rand_vals + 10 + geo_value = "ca", + time_value = simple_dates, + a = n_days:1, + b = rand_vals + 10 ))) test_that("rolling_mean generates correct mean", { rolled <- rolling_mean(epi_data) - expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a7", "b7")) + expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m7", "b_m7")) # hand specified rolling mean with a rear window of 7, noting that mean(1:7) = 4 - linear_roll_mean <- c(seq(from=1, to = 4, by = .5), seq(from = 4.5, to = 36.5, by = 1)) - expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a7"), linear_roll_mean) + linear_roll_mean <- c(seq(from = 1, to = 4, by = .5), seq(from = 4.5, to = 36.5, by = 1)) + expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m7"), linear_roll_mean) # same, but "ca" is reversed, noting mean(40:(40-7)) =36.5 - linear_reverse_roll_mean <- c(seq(from=40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1)) - expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a7"), linear_reverse_roll_mean) + linear_reverse_roll_mean <- c(seq(from = 40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1)) + expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), linear_reverse_roll_mean) }) test_that("rolling_sd generates correct standard deviation", { rolled <- rolling_sd(epi_data) expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_SD28", "b_SD28")) # hand specified rolling mean with a rear window of 7, noting that mean(1:14) = 7.5 - linear_roll_mean <- c(seq(from=1, to = 7.5, by = .5), seq(from = 8, to = 33, by = 1)) + linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), seq(from = 8, to = 33, by = 1)) # and the standard deviation is linear_roll_sd <- sqrt(slider::slide_dbl((1:40 - linear_roll_mean)^2, mean, .before = 28)) expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_SD28"), linear_roll_sd) # even though ca is reversed, the changes are all the same, so the standard deviation is *exactly* the same values expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_SD28"), linear_roll_sd) - }) +}) +testthat("get_trainable_names pulls out mean and sd columns", { + rolled <- rolling_sd(epi_data, keep_mean = TRUE) + 
expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_SD28", "b_m14", "b_SD28")) + expect_equal(get_trainable_names(rolled, NULL), c("a", "b")) +}) # TODO example with NA's, example with missing days, only one column, keep_mean From 1edc6f6d5411ea370a341c40ede5f6f1eaeed212 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 13 Dec 2023 14:57:33 -0800 Subject: [PATCH 03/20] smoothed_scaled passes all forecaster tests --- NAMESPACE | 4 ++ R/data_transforms.R | 60 +++++++++++++++++++++++- R/epipredict_utilities.R | 8 ++-- R/forecaster_smoothed_scaled.R | 4 +- man/arx_preprocess.Rd | 6 +-- man/cache_metadata.Rd | 11 +++++ man/get_nonkey_names.Rd | 14 ++++++ man/update_predictors.Rd | 16 +++++++ tests/testthat/test-forecasters-basics.R | 1 + tests/testthat/test-transforms.R | 19 +++++++- 10 files changed, 131 insertions(+), 12 deletions(-) create mode 100644 man/cache_metadata.Rd create mode 100644 man/get_nonkey_names.Rd create mode 100644 man/update_predictors.Rd diff --git a/NAMESPACE b/NAMESPACE index f65be2b1..8b1721f6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,6 +40,7 @@ export(single_id) export(slide_forecaster) export(smoothed_scaled) export(underprediction) +export(update_predictors) export(weighted_interval_score) importFrom(assertthat,assert_that) importFrom(cli,cli_abort) @@ -88,9 +89,12 @@ importFrom(epiprocess,epix_slide) importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(purrr,imap) +importFrom(purrr,list_modify) importFrom(purrr,map) importFrom(purrr,map2_vec) +importFrom(purrr,map_chr) importFrom(purrr,map_vec) +importFrom(purrr,reduce) importFrom(purrr,transpose) importFrom(recipes,all_numeric) importFrom(rlang,"!!") diff --git a/R/data_transforms.R b/R/data_transforms.R index 43164f4a..04a71494 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -6,14 +6,48 @@ #' @param cols vector of column names to use. 
If `NULL`, fill with all non-key columns get_trainable_names <- function(epi_data, cols) { if (is.null(cols)) { - cols <- names(epi_data) - cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))] + cols <- get_nonkey_names(epi_data) # exclude anything with the same naming schema as the rolling average/sd created below cols <- cols[!grepl("_\\w{1,2}\\d+", cols)] } return(cols) } +#' just the names which aren't keys for an epi_df +#' @description +#' names, but it excludes keys +#' @param epi_data the epi_df +get_nonkey_names <- function(epi_data) { + cols <- names(epi_data) + cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))] +} + + +#' update the predictors to only contain the smoothed/sd versions of cols +#' @description +#' should only be applied after both rolling_mean and rolling_sd +#' @param epi_data the epi_df +#' @param cols the list of columns +#' @importFrom purrr map map_chr reduce +#' @export +update_predictors <- function(epi_data, cols_modified, predictors) { + if (!is.null(cols_modified)) { + # if cols_modified isn't null, make sure we include predictors that weren't modified + other_predictors <- map(cols_modified, ~ !grepl(.x, predictors)) %>% reduce(`&`) + other_predictors <- predictors[other_predictors] + } else { + other_predictors <- c() + } + # all the non-key names + col_names <- get_nonkey_names(epi_data) + is_present <- function(x) { + grepl(x, col_names) & !(col_names %in% predictors) + } + is_modified <- map(predictors, is_present) %>% reduce(`|`) + new_predictors <- col_names[is_modified] + return(c(other_predictors, new_predictors)) +} + #' get a rolling average for the named columns #' @description #' add column(s) that are the rolling means of the specified columns, as @@ -36,6 +70,25 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) { return(epi_data) } +#' store the metadata in a easy to reapply way +#' @importFrom purrr list_modify +cache_metadata <- function(epi_data) { + features <- list() + all_others <- attributes(epi_data)$metadata + all_others["geo_type"] <- NULL + all_others["time_type"] <- NULL + all_others["as_of"] <- NULL + if (length(all_others) == 0) { + all_others <- list() + } + features <- list( + as_of = attributes(epi_data)$metadata$as_of, + geo_type = attributes(epi_data)$metadata$geo_type, + time_type = attributes(epi_data)$metadata$time_type, all_others = all_others + ) + return(features) +} + #' get a rolling standard deviation for the named columns #' @description #' A rolling standard deviation, based off of a rolling mean. 
First it
@@ -56,6 +56,7 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd =
     mean_width <- as.integer(ceiling(sd_width / 2))
   }
   cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
+  metadata <- cache_metadata(epi_data)
   epi_data %<>% group_by(geo_value)
   for (col in cols_to_sd) {
     mean_name <- paste0(col, "_m", mean_width)
@@ -63,8 +64,10 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd =
     epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width))
     epi_data %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width))
     if (!keep_mean) {
+      # TODO make sure the extra info sticks around
       epi_data %<>% select(-{{ mean_name }})
     }
+    epi_data %<>% as_epi_df(metadata$geo_type, metadata$time_type, metadata$as_of, metadata$all_others)
   }
   epi_data %<>% ungroup()
   return(epi_data)
diff --git a/R/epipredict_utilities.R b/R/epipredict_utilities.R
index fc56925f..9e826273 100644
--- a/R/epipredict_utilities.R
+++ b/R/epipredict_utilities.R
@@ -9,18 +9,18 @@
 #' @seealso [arx_postprocess] for the layer equivalent
 #' @importFrom epipredict step_epi_lag step_epi_ahead step_epi_naomit step_training_window
 #' @export
-arx_preprocess <- function(rec, outcome, predictors, args_list) {
+arx_preprocess <- function(preproc, outcome, predictors, args_list) {
   # input already validated
   lags <- args_list$lags
   for (l in seq_along(lags)) {
     p <- predictors[l]
-    rec %<>% step_epi_lag(!!p, lag = lags[[l]])
+    preproc %<>% step_epi_lag(!!p, lag = lags[[l]])
   }
-  rec %<>%
+  preproc %<>%
     step_epi_ahead(!!outcome, ahead = args_list$ahead) %>%
     step_epi_naomit() %>%
     step_training_window(n_recent = args_list$n_training)
-  return(rec)
+  return(preproc)
 }
 
 # TODO replace with `layer_arx_forecaster`
diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R
index 0edbaa78..12e93315 100644
--- a/R/forecaster_smoothed_scaled.R
+++ b/R/forecaster_smoothed_scaled.R
@@ -100,8 +100,8 @@ smoothed_scaled <- function(epi_data,
       keep_mean = keep_mean
     )
   }
-  # even
-
+  # and need to make sure we exclude the original variables as predictors
+  predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors)
   # preprocessing supported by epipredict
   preproc <- epi_recipe(epi_data)
   if (pop_scaling) {
diff --git a/man/arx_preprocess.Rd b/man/arx_preprocess.Rd
index 35e2e0b0..1b24a41c 100644
--- a/man/arx_preprocess.Rd
+++ b/man/arx_preprocess.Rd
@@ -4,16 +4,16 @@
 \alias{arx_preprocess}
 \title{add the default steps for arx_forecaster}
 \usage{
-arx_preprocess(rec, outcome, predictors, args_list)
+arx_preprocess(preproc, outcome, predictors, args_list)
 }
 \arguments{
-\item{rec}{an \code{\link[epipredict:epi_recipe]{epipredict::epi_recipe}}}
-
 \item{outcome}{a character of the column to be predicted}
 
 \item{predictors}{a character vector of the columns used as predictors}
 
 \item{args_list}{an \code{\link[epipredict:arx_args_list]{epipredict::arx_args_list}}}
+
+\item{rec}{an \code{\link[epipredict:epi_recipe]{epipredict::epi_recipe}}}
 }
 \description{
 add the default steps for arx_forecaster
diff --git a/man/cache_metadata.Rd b/man/cache_metadata.Rd
new file mode 100644
index 00000000..14a6d64b
--- /dev/null
+++ b/man/cache_metadata.Rd
@@ -0,0 +1,11 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_transforms.R
+\name{cache_metadata}
+\alias{cache_metadata}
+\title{store the metadata in a easy to reapply way}
+\usage{
+cache_metadata(epi_data) +} +\description{ +store the metadata in a easy to reapply way +} diff --git a/man/get_nonkey_names.Rd b/man/get_nonkey_names.Rd new file mode 100644 index 00000000..242ebabd --- /dev/null +++ b/man/get_nonkey_names.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_transforms.R +\name{get_nonkey_names} +\alias{get_nonkey_names} +\title{just the names which aren't keys for an epi_df} +\usage{ +get_nonkey_names(epi_data) +} +\arguments{ +\item{epi_data}{the epi_df} +} +\description{ +names, but it excludes keys +} diff --git a/man/update_predictors.Rd b/man/update_predictors.Rd new file mode 100644 index 00000000..e7d55b21 --- /dev/null +++ b/man/update_predictors.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_transforms.R +\name{update_predictors} +\alias{update_predictors} +\title{update the predictors to only contain the smoothed/sd versions of cols} +\usage{ +update_predictors(epi_data, cols_modified, predictors) +} +\arguments{ +\item{epi_data}{the epi_df} + +\item{cols}{the list of columns} +} +\description{ +should only be applied after both rolling_mean and rolling_sd +} diff --git a/tests/testthat/test-forecasters-basics.R b/tests/testthat/test-forecasters-basics.R index 0c61a8d8..785b6ad2 100644 --- a/tests/testthat/test-forecasters-basics.R +++ b/tests/testthat/test-forecasters-basics.R @@ -5,6 +5,7 @@ forecasters <- list( c("flatline_fc", flatline_fc), c("smoothed_scaled", smoothed_scaled) ) +forecaster <- forecasters[[3]] for (forecaster in forecasters) { test_that(paste(forecaster[[1]], "gets the date and columns right"), { jhu <- epipredict::case_death_rate_subset %>% diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R index 3db526b4..96fc0df6 100644 --- a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -21,6 +21,7 @@ test_that("rolling_mean generates correct mean", { # same, but "ca" is reversed, noting mean(40:(40-7)) =36.5 linear_reverse_roll_mean <- c(seq(from = 40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1)) expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), linear_reverse_roll_mean) + expect_true("epi_df" %in% class(rolled)) }) test_that("rolling_sd generates correct standard deviation", { @@ -33,10 +34,26 @@ test_that("rolling_sd generates correct standard deviation", { expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_SD28"), linear_roll_sd) # even though ca is reversed, the changes are all the same, so the standard deviation is *exactly* the same values expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_SD28"), linear_roll_sd) + # doesn't break types + expect_true("epi_df" %in% class(rolled)) }) -testthat("get_trainable_names pulls out mean and sd columns", { + +test_that("get_trainable_names pulls out mean and sd columns", { rolled <- rolling_sd(epi_data, keep_mean = TRUE) expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_SD28", "b_m14", "b_SD28")) expect_equal(get_trainable_names(rolled, NULL), c("a", "b")) }) # TODO example with NA's, example with missing days, only one column, keep_mean + +test_that("update_predictors keeps unmodified predictors", { + epi_data["c"] = NaN + epi_data["d"] = NaN + epi_data["b_m14"] = NaN + epi_data["b_SD28"] = NaN + predictors <- c("a", "b", "c") # everything but d + modified <- c("b", "c") # we want to exclude b but not its modified versions + 
expected_predictors <- c("a", "b_m14", "b_SD28") + expect_equal(update_predictors(epi_data, modified, predictors), expected_predictors) + expected_if_all_modified <- c("b_m14", "b_SD28") + expect_equal(update_predictors(epi_data, NULL, predictors), expected_if_all_modified) +}) From e63be8924f437492458eb051c9ece4cfbc87807e Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 13 Dec 2023 15:35:43 -0800 Subject: [PATCH 04/20] smoothed_scaled data tests --- tests/testthat/test-forecasters-data.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-forecasters-data.R b/tests/testthat/test-forecasters-data.R index 365811dd..d12b4a40 100644 --- a/tests/testthat/test-forecasters-data.R +++ b/tests/testthat/test-forecasters-data.R @@ -6,8 +6,10 @@ forecasters <- tibble::tribble( ~forecaster, ~extra_params, ~extra_params_names, ~fc_name, scaled_pop, list(1, TRUE), list("ahead", "pop_scaling"), "scaled_pop", scaled_pop, list(1, FALSE), list("ahead", "pop_scaling"), "scaled_pop", - flatline_fc, list(1), list("ahead"), "flatline_fc" + flatline_fc, list(1), list("ahead"), "flatline_fc", + smoothed_scaled, list(1), list("ahead"), "smoothed_scaled" ) +expects_nonequal <- c("scaled_pop", "smoothed_scaled") synth_mean <- 25 synth_sd <- 2 tiny_sd <- 1.0e-5 @@ -49,7 +51,7 @@ different_constants <- epiprocess::as_epi_archive(rbind( )) for (ii in 1:nrow(forecasters)) { test_that(paste(forecasters$fc_name[[ii]], " predicts a constant median for constant data"), { - if (forecasters$fc_name[[ii]] == "scaled_pop") { + if (any(forecasters$fc_name[[ii]] %in% expects_nonequal)) { suppressWarnings(expect_warning(res <- get_pred(different_constants, ii), regexp = "prediction from rank-deficient fit")) } else { res <- get_pred(different_constants, ii) @@ -126,7 +128,7 @@ missing_state <- epiprocess::as_epi_archive(rbind( )) for (ii in seq_len(nrow(forecasters))) { test_that(paste(forecasters$fc_name[[ii]], "predicts well in the presence of only one state with variably delayed data"), { - if (forecasters$fc_name[[ii]] == "scaled_pop") { + if (any(forecasters$fc_name[[ii]] %in% expects_nonequal)) { suppressWarnings(expect_warning(res <- get_pred(missing_state, ii), regexp = "prediction from rank-deficient fit")) } else { res <- get_pred(missing_state, ii) @@ -171,7 +173,7 @@ for (ii in seq_len(nrow(forecasters))) { test_that(paste(forecasters$fc_name[[ii]], "predicts a linear increasing slope correctly"), { # flatline will definitely fail this, so it's exempt if (!identical(forecasters$forecaster[[ii]], flatline_fc)) { - if (forecasters$fc_name[[ii]] == "scaled_pop") { + if (any(forecasters$fc_name[[ii]] %in% expects_nonequal)) { suppressWarnings(expect_warning(res <- get_pred(linear, ii), regexp = "prediction from rank-deficient fit")) } else { res <- get_pred(linear, ii) From ea335c9f418d19d81ccbdfc734d2efa2a4eb6838 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 14 Dec 2023 10:42:48 -0800 Subject: [PATCH 05/20] docfix, points no longer oversized --- R/epipredict_utilities.R | 2 +- app.R | 2 +- man/arx_preprocess.Rd | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/epipredict_utilities.R b/R/epipredict_utilities.R index 9e826273..083ce35f 100644 --- a/R/epipredict_utilities.R +++ b/R/epipredict_utilities.R @@ -2,7 +2,7 @@ #' add the default steps for arx_forecaster #' @description #' add the default steps for arx_forecaster -#' @param rec an [`epipredict::epi_recipe`] +#' @param preproc an [`epipredict::epi_recipe`] #' @param outcome a character of the 
column to be predicted #' @param predictors a character vector of the columns used as predictors #' @param args_list an [`epipredict::arx_args_list`] diff --git a/app.R b/app.R index c5ecf1a2..ef60b3c3 100644 --- a/app.R +++ b/app.R @@ -261,7 +261,7 @@ shinyApp( # Use scatterplot or lines depending on the x var. { if (input$x_var %in% c(input$facet_vars, "geo_value", "forecaster", "ahead")) { - . + geom_point(aes(size = n)) + expand_limits(size = 0) + . + geom_point(aes(size = n/100)) + expand_limits(size = 0) } else { . + geom_line() } diff --git a/man/arx_preprocess.Rd b/man/arx_preprocess.Rd index 1b24a41c..44ea895a 100644 --- a/man/arx_preprocess.Rd +++ b/man/arx_preprocess.Rd @@ -7,13 +7,13 @@ arx_preprocess(preproc, outcome, predictors, args_list) } \arguments{ +\item{preproc}{an \code{\link[epipredict:epi_recipe]{epipredict::epi_recipe}}} + \item{outcome}{a character of the column to be predicted} \item{predictors}{a character vector of the columns used as predictors} \item{args_list}{an \code{\link[epipredict:arx_args_list]{epipredict::arx_args_list}}} - -\item{rec}{an \code{\link[epipredict:epi_recipe]{epipredict::epi_recipe}}} } \description{ add the default steps for arx_forecaster From 8df5f05ce513453d55dc8c26e8f49e4a08036a9a Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 14 Dec 2023 11:00:28 -0800 Subject: [PATCH 06/20] docs only tell one thing at a time --- R/data_transforms.R | 6 +++++- app.R | 4 ++-- man/cache_metadata.Rd | 3 +++ man/update_predictors.Rd | 4 +++- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/R/data_transforms.R b/R/data_transforms.R index 04a71494..8f429f90 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -27,7 +27,8 @@ get_nonkey_names <- function(epi_data) { #' @description #' should only be applied after both rolling_mean and rolling_sd #' @param epi_data the epi_df -#' @param cols the list of columns +#' @param cols_modified the list of columns +#' @param predictors the initial set of predictors; any unmodified are kept, any modified are replaced #' @importFrom purrr map map_chr reduce #' @export update_predictors <- function(epi_data, cols_modified, predictors) { @@ -71,6 +72,9 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) { } #' store the metadata in a easy to reapply way +#' @description +#' store the metadata in a easy to reapply way +#' @param epi_data the epi_df #' @importFrom purrr list_modify cache_metadata <- function(epi_data) { features <- list() diff --git a/app.R b/app.R index ef60b3c3..064c4ffe 100644 --- a/app.R +++ b/app.R @@ -170,7 +170,7 @@ shinyApp( verticalLayout( plotlyOutput("main_plot", height = "90em"), h2("forecaster name -> parameters"), - #textOutput("forecaster_param_title"), + # textOutput("forecaster_param_title"), dataTableOutput("forecaster_table"), h2("ensemble name -> parameters"), dataTableOutput("ensemble_table") @@ -261,7 +261,7 @@ shinyApp( # Use scatterplot or lines depending on the x var. { if (input$x_var %in% c(input$facet_vars, "geo_value", "forecaster", "ahead")) { - . + geom_point(aes(size = n/100)) + expand_limits(size = 0) + . + geom_point(aes(size = n / 100)) + expand_limits(size = 0) } else { . 
+ geom_line() } diff --git a/man/cache_metadata.Rd b/man/cache_metadata.Rd index 14a6d64b..1dde416f 100644 --- a/man/cache_metadata.Rd +++ b/man/cache_metadata.Rd @@ -6,6 +6,9 @@ \usage{ cache_metadata(epi_data) } +\arguments{ +\item{epi_data}{the epi_df} +} \description{ store the metadata in a easy to reapply way } diff --git a/man/update_predictors.Rd b/man/update_predictors.Rd index e7d55b21..9b9e603c 100644 --- a/man/update_predictors.Rd +++ b/man/update_predictors.Rd @@ -9,7 +9,9 @@ update_predictors(epi_data, cols_modified, predictors) \arguments{ \item{epi_data}{the epi_df} -\item{cols}{the list of columns} +\item{cols_modified}{the list of columns} + +\item{predictors}{the initial set of predictors; any unmodified are kept, any modified are replaced} } \description{ should only be applied after both rolling_mean and rolling_sd From 36608f6a917dafb105c9a03ce841be13a039d5a1 Mon Sep 17 00:00:00 2001 From: David Weber Date: Tue, 19 Dec 2023 14:26:29 -0800 Subject: [PATCH 07/20] Update R/data_transforms.R Co-authored-by: brookslogan --- R/data_transforms.R | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/R/data_transforms.R b/R/data_transforms.R index 8f429f90..1434d3d0 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -113,19 +113,18 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = mean_width <- as.integer(ceiling(sd_width / 2)) } cols_to_sd <- get_trainable_names(epi_data, cols_to_sd) - metadata <- cache_metadata(epi_data) - epi_data %<>% group_by(geo_value) + result <- epi_data + result %<>% group_by(geo_value) for (col in cols_to_sd) { mean_name <- paste0(col, "_m", mean_width) sd_name <- paste0(col, "_SD", sd_width) - epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width)) - epi_data %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width)) + result %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width)) + result %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width)) if (!keep_mean) { # TODO make sure the extra info sticks around - epi_data %<>% select(-{{ mean_name }}) + result %<>% select(-{{ mean_name }}) } - epi_data %<>% as_epi_df(metadata$geo_type, metadata$time_type, metadata$as_of, metadata$all_others) } - epi_data %<>% ungroup() - return(epi_data) + result %<>% ungroup() + result %<>% dplyr_reconstruct(epi_data) } From 0eb61e3adf0cd24ccacf3cf8511acdee981510ff Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 19 Dec 2023 14:37:39 -0800 Subject: [PATCH 08/20] fix: warnings are one at a time, apparently --- R/forecaster_smoothed_scaled.R | 1 + man/smoothed_scaled.Rd | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R index 12e93315..becfb3a3 100644 --- a/R/forecaster_smoothed_scaled.R +++ b/R/forecaster_smoothed_scaled.R @@ -27,6 +27,7 @@ #' @param sd_cols the names of the columns to smooth. If `NULL` its includes #' the sd of everything #' @param quantile_levels The quantile levels to predict. Defaults to those +#' @param ... any additional arguments as used by [arx_args_list] #' required by covidhub. 
#' @seealso some utilities for making forecasters: [format_storage], #' [perform_sanity_checks] diff --git a/man/smoothed_scaled.Rd b/man/smoothed_scaled.Rd index 4a079fa4..683c56d7 100644 --- a/man/smoothed_scaled.Rd +++ b/man/smoothed_scaled.Rd @@ -38,8 +38,7 @@ existing examples)} \item{trainer}{an example extra parameter that is fairly common} -\item{quantile_levels}{The quantile levels to predict. Defaults to those -required by covidhub.} +\item{quantile_levels}{The quantile levels to predict. Defaults to those} \item{smooth_width}{the number of days over which to do smoothing. If \code{NULL}, then no smoothing is applied.} @@ -55,6 +54,9 @@ used.} \item{sd_cols}{the names of the columns to smooth. If \code{NULL} its includes the sd of everything} + +\item{...}{any additional arguments as used by \link{arx_args_list} +required by covidhub.} } \description{ This is a variant of \code{scaled_pop}, which predicts on a smoothed version of From c50249b42560be4bde440f6ed838f2a7ecf78173 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 20 Dec 2023 11:05:04 -0800 Subject: [PATCH 09/20] switch to epi_slide, add logan's suggestions, NA tests --- NAMESPACE | 3 +- R/data_transforms.R | 37 ++++-------------- man/cache_metadata.Rd | 14 ------- tests/testthat/test-transforms.R | 67 ++++++++++++++++++++++++-------- 4 files changed, 59 insertions(+), 62 deletions(-) delete mode 100644 man/cache_metadata.Rd diff --git a/NAMESPACE b/NAMESPACE index 8b1721f6..c4dd11ea 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -85,11 +85,11 @@ importFrom(epipredict,step_epi_naomit) importFrom(epipredict,step_population_scaling) importFrom(epipredict,step_training_window) importFrom(epiprocess,as_epi_df) +importFrom(epiprocess,epi_slide) importFrom(epiprocess,epix_slide) importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(purrr,imap) -importFrom(purrr,list_modify) importFrom(purrr,map) importFrom(purrr,map2_vec) importFrom(purrr,map_chr) @@ -103,7 +103,6 @@ importFrom(rlang,.data) importFrom(rlang,quo) importFrom(rlang,sym) importFrom(rlang,syms) -importFrom(slider,slide2_dbl) importFrom(slider,slide_dbl) importFrom(targets,tar_config_get) importFrom(targets,tar_group) diff --git a/R/data_transforms.R b/R/data_transforms.R index 1434d3d0..b06f8779 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -59,40 +59,19 @@ update_predictors <- function(epi_data, cols_modified, predictors) { #' @param width the number of days (or examples, the sliding isn't time-aware) to use #' @param cols_to_mean the non-key columns to take the mean over. 
`NULL` means all
 #' @importFrom slider slide_dbl
+#' @importFrom epiprocess epi_slide
 #' @export
 rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
   cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
   epi_data %<>% group_by(geo_value)
   for (col in cols_to_mean) {
     mean_name <- paste0(col, "_m", width)
-    epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = width))
+    epi_data %<>% epi_slide(~ mean(.x[[col]]), before = width, new_col_name = mean_name)
   }
   epi_data %<>% ungroup()
   return(epi_data)
 }
 
-#' store the metadata in a easy to reapply way
-#' @description
-#' store the metadata in a easy to reapply way
-#' @param epi_data the epi_df
-#' @importFrom purrr list_modify
-cache_metadata <- function(epi_data) {
-  features <- list()
-  all_others <- attributes(epi_data)$metadata
-  all_others["geo_type"] <- NULL
-  all_others["time_type"] <- NULL
-  all_others["as_of"] <- NULL
-  if (length(all_others) == 0) {
-    all_others <- list()
-  }
-  features <- list(
-    as_of = attributes(epi_data)$metadata$as_of,
-    geo_type = attributes(epi_data)$metadata$geo_type,
-    time_type = attributes(epi_data)$metadata$time_type, all_others = all_others
-  )
-  return(features)
-}
-
 #' get a rolling standard deviation for the named columns
 #' @description
 #' A rolling standard deviation, based off of a rolling mean. First it
@@ -106,25 +85,25 @@ cache_metadata <- function(epi_data) {
 #' (so 14 in the complete default case)
 #' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
 #' @param keep_mean bool, if `TRUE`, it retains the mean column
-#' @importFrom slider slide_dbl slide2_dbl
+#' @importFrom epiprocess epi_slide
 #' @export
 rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
   if (is.null(mean_width)) {
     mean_width <- as.integer(ceiling(sd_width / 2))
   }
   cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
   result <- epi_data
-  result %<>% group_by(geo_value)
   for (col in cols_to_sd) {
+    result %<>% group_by(geo_value)
     mean_name <- paste0(col, "_m", mean_width)
-    sd_name <- paste0(col, "_SD", sd_width)
-    result %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width))
-    result %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width))
+    sd_name <- paste0(col, "_sd", sd_width)
+    result %<>% epi_slide(~ mean(.x[[col]]), before = mean_width, new_col_name = mean_name)
+    result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2)), before = sd_width, new_col_name = sd_name)
     if (!keep_mean) {
       # TODO make sure the extra info sticks around
       result %<>% select(-{{ mean_name }})
     }
+    result %<>% dplyr_reconstruct(epi_data)
   }
   result %<>% ungroup()
-  result %<>% dplyr_reconstruct(epi_data)
 }
diff --git a/man/cache_metadata.Rd b/man/cache_metadata.Rd
deleted file mode 100644
index 1dde416f..00000000
--- a/man/cache_metadata.Rd
+++ /dev/null
@@ -1,14 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data_transforms.R
-\name{cache_metadata}
-\alias{cache_metadata}
-\title{store the metadata in a easy to reapply way}
-\usage{
-cache_metadata(epi_data)
-}
-\arguments{
-\item{epi_data}{the epi_df}
-}
-\description{
-store the metadata in a easy to reapply way
-}
diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R
index 3db526b4..73b522ff 100644
--- 
a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -1,46 +1,79 @@ +library(dplyr) n_days <- 40 +removed_date <- 10 simple_dates <- seq(as.Date("2012-01-01"), by = "day", length.out = n_days) -rand_vals <- rnorm(n_days) +simple_dates <- simple_dates[-removed_date] +rand_vals <- rnorm(n_days-1) + +# Two states, with 2 variables. a is linear, going up in one state and down in the other +# b is just random +# note that day 10 is missing epi_data <- epiprocess::as_epi_df(rbind(tibble( geo_value = "al", time_value = simple_dates, - a = 1:n_days, + a = 1:(n_days-1), b = rand_vals ), tibble( geo_value = "ca", time_value = simple_dates, - a = n_days:1, + a = (n_days-1):1, b = rand_vals + 10 ))) test_that("rolling_mean generates correct mean", { rolled <- rolling_mean(epi_data) + rolled expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m7", "b_m7")) # hand specified rolling mean with a rear window of 7, noting that mean(1:7) = 4 - linear_roll_mean <- c(seq(from = 1, to = 4, by = .5), seq(from = 4.5, to = 36.5, by = 1)) - expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m7"), linear_roll_mean) + linear_roll_mean <- c(seq(from = 1, to = 4, by = .5), seq(from = 4.5, to = 35.5, by = 1)) + # day 10 is missing, so days 11-18 are thrown off + lag_st <- 10 + unusual_days <- c(mean(c((lag_st):(lag_st-6))), mean(c((lag_st+1):(lag_st+1-6))), mean(c((lag_st+2):(lag_st+2-6))), mean(c((lag_st+3):(lag_st+3-6))), mean(c((lag_st+4):(lag_st+4-6))), mean(c((lag_st+5):(lag_st+5-6))), mean(c((lag_st+6):(lag_st+6-6)))) + # stitching the lag induced hiccup into the "normal" mean values + expected_mean <- c(linear_roll_mean[1:9], unusual_days, linear_roll_mean[17:(n_days-1)]) + + expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m7"), expected_mean) # same, but "ca" is reversed, noting mean(40:(40-7)) =36.5 - linear_reverse_roll_mean <- c(seq(from = 40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1)) - expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), linear_reverse_roll_mean) + linear_reverse_roll_mean <- c(seq(from = 39, to = 35.5, by = -0.5), seq(from = 34.5, to = 4.5, by = -1)) + lag_st <- 36 + # day 10 is missing, so days 11-18 are thrown off + unusual_days <- c(mean(c((lag_st):(lag_st-6))), mean(c((lag_st-1):(lag_st-1-6))), mean(c((lag_st-2):(lag_st-2-6))), mean(c((lag_st-3):(lag_st-3-6))), mean(c((lag_st-4):(lag_st-4-6))), mean(c((lag_st-5):(lag_st-5-6))), mean(c((lag_st-6):(lag_st-6-6)))) + # stitching the lag induced hiccup into the "normal" mean values + expected_mean <- c(linear_reverse_roll_mean[1:9], unusual_days, linear_reverse_roll_mean[17:(n_days-1)]) + # actually testing + expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), expected_mean) + + # making sure the type is maintained expect_true("epi_df" %in% class(rolled)) }) test_that("rolling_sd generates correct standard deviation", { - rolled <- rolling_sd(epi_data) - expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_SD28", "b_SD28")) + rolled <- rolling_sd(epi_data,keep_mean = TRUE) + rolled %>% filter(geo_value == "al") %>% pull("a_m14") + rolled %>% filter(geo_value == "al") + rolled %>% filter(geo_value == "al") %>% pull("a") + rolled %>% filter(geo_value == "al") %>% pull("a_sd28") + expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_sd28", "b_sd28")) # hand specified rolling mean with a rear window of 7, noting that mean(1:14) = 7.5 - linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), 
seq(from = 8, to = 33, by = 1)) + linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), seq(from = 8.5, to = 16.5, by = 1), seq(from = 17, to = 32, by = 1)) + linear_roll_mean + expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m14"), linear_roll_mean) # and the standard deviation is - linear_roll_sd <- sqrt(slider::slide_dbl((1:40 - linear_roll_mean)^2, mean, .before = 28)) - expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_SD28"), linear_roll_sd) + linear_roll_mean <- append(linear_roll_mean, NA, after = removed_date-1) + linear_values <- 1:39 + linear_values <- append(linear_values, NA, after = removed_date-1) + linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28)) + # drop the extra date caused by the inclusion of the NAs + linear_roll_sd <- linear_roll_sd[-(removed_date)] + expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_sd28"), linear_roll_sd) # even though ca is reversed, the changes are all the same, so the standard deviation is *exactly* the same values - expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_SD28"), linear_roll_sd) + expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_sd28"), linear_roll_sd) # doesn't break types expect_true("epi_df" %in% class(rolled)) }) test_that("get_trainable_names pulls out mean and sd columns", { rolled <- rolling_sd(epi_data, keep_mean = TRUE) - expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_SD28", "b_m14", "b_SD28")) + expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_sd28", "b_m14", "b_sd28")) expect_equal(get_trainable_names(rolled, NULL), c("a", "b")) }) # TODO example with NA's, example with missing days, only one column, keep_mean @@ -49,11 +82,11 @@ test_that("update_predictors keeps unmodified predictors", { epi_data["c"] = NaN epi_data["d"] = NaN epi_data["b_m14"] = NaN - epi_data["b_SD28"] = NaN + epi_data["b_sd28"] = NaN predictors <- c("a", "b", "c") # everything but d modified <- c("b", "c") # we want to exclude b but not its modified versions - expected_predictors <- c("a", "b_m14", "b_SD28") + expected_predictors <- c("a", "b_m14", "b_sd28") expect_equal(update_predictors(epi_data, modified, predictors), expected_predictors) - expected_if_all_modified <- c("b_m14", "b_SD28") + expected_if_all_modified <- c("b_m14", "b_sd28") expect_equal(update_predictors(epi_data, NULL, predictors), expected_if_all_modified) }) From 0b16c818336f546e5c8e6638851194f46b67ee83 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 20 Dec 2023 11:43:15 -0800 Subject: [PATCH 10/20] test: fix updated assumption --- tests/testthat/test-transforms.R | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R index 73b522ff..7976984e 100644 --- a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -48,11 +48,7 @@ test_that("rolling_mean generates correct mean", { test_that("rolling_sd generates correct standard deviation", { rolled <- rolling_sd(epi_data,keep_mean = TRUE) - rolled %>% filter(geo_value == "al") %>% pull("a_m14") - rolled %>% filter(geo_value == "al") - rolled %>% filter(geo_value == "al") %>% pull("a") - rolled %>% filter(geo_value == "al") %>% pull("a_sd28") - expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_sd28", "b_sd28")) + expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_sd28", "b_m14", "b_sd28")) 
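+  # (keep_mean = TRUE above also retains the intermediate a_m14/b_m14 rolling-mean columns,
+  # which is why they appear here alongside the a_sd28/b_sd28 columns)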
# hand specified rolling mean with a rear window of 7, noting that mean(1:14) = 7.5 linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), seq(from = 8.5, to = 16.5, by = 1), seq(from = 17, to = 32, by = 1)) linear_roll_mean From 514219e79aae725b1f19406049c1fb570898e890 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 20 Dec 2023 11:55:34 -0800 Subject: [PATCH 11/20] test: make sure keep is off by default --- tests/testthat/test-transforms.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R index 7976984e..4020d437 100644 --- a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -86,3 +86,8 @@ test_that("update_predictors keeps unmodified predictors", { expected_if_all_modified <- c("b_m14", "b_sd28") expect_equal(update_predictors(epi_data, NULL, predictors), expected_if_all_modified) }) + +test_that("rolling_sd doesn't keep the mean columns by default", { + rolled <- rolling_sd(epi_data) + expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_sd28", "b_sd28")) +}) From b976103d758d45375b4c4a814ec1a7d667405425 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 20 Dec 2023 13:36:03 -0800 Subject: [PATCH 12/20] docs: slightly better --- R/data_transforms.R | 11 +++++++---- R/targets_utils.R | 9 ++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/R/data_transforms.R b/R/data_transforms.R index b06f8779..ecf6fcf7 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -2,7 +2,7 @@ #' extract the non-key, non-smoothed columns from epi_data #' @keywords internal -#' @param epi_data the epi_data tibble +#' @param epi_data the `epi_df` #' @param cols vector of column names to use. If `NULL`, fill with all non-key columns get_trainable_names <- function(epi_data, cols) { if (is.null(cols)) { @@ -20,14 +20,17 @@ get_nonkey_names <- function(epi_data) { cols <- names(epi_data) cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))] + return(cols) } #' update the predictors to only contain the smoothed/sd versions of cols #' @description -#' should only be applied after both rolling_mean and rolling_sd +#' modifies the list of predictors so that any which have been modified have the +#' modified versions included, and not the original. Should only be applied +#' after both rolling_mean and rolling_sd. #' @param epi_data the epi_df -#' @param cols_modified the list of columns +#' @param cols_modified the list of columns to modify. If this is `NULL`, that means we were modifying every column.
#' @param predictors the initial set of predictors; any unmodified are kept, any modified are replaced #' @importFrom purrr map map_chr reduce #' @export @@ -37,7 +40,7 @@ update_predictors <- function(epi_data, cols_modified, predictors) { other_predictors <- map(cols_modified, ~ !grepl(.x, predictors)) %>% reduce(`&`) other_predictors <- predictors[other_predictors] } else { - other_predictors <- c() + other_predictors <- character(0L) } # all the non-key names col_names <- get_nonkey_names(epi_data) diff --git a/R/targets_utils.R b/R/targets_utils.R index feac27c2..1743e5da 100644 --- a/R/targets_utils.R +++ b/R/targets_utils.R @@ -138,7 +138,14 @@ make_shared_grids <- function() { ), tidyr::expand_grid( forecaster = "flatline_fc", - ahead = 1:7 + ahead = c(1:7, 14, 21, 28) + ), + tidyr::expand_grid( + forecaster = "smoothed_scaled", + trainer = c("quantreg"), + ahead = c(1:7, 14, 21, 28), + lags = list(list(c(0, 3, 5, 7, 14), c(0),), c(0, 7, 14)), + pop_scaling = c(FALSE) ) ) } From c4c7430aea12249948c29dcf4e29c5da99b07780 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 21 Dec 2023 15:29:47 -0800 Subject: [PATCH 13/20] continuing to clarify update_predictors --- R/data_transforms.R | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/R/data_transforms.R b/R/data_transforms.R index ecf6fcf7..1ccb3ec9 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -29,27 +29,29 @@ get_nonkey_names <- function(epi_data) { #' modifies the list of preditors so that any which have been modified have the #' modified versions included, and not the original. Should only be applied #' after both rolling_mean and rolling_sd. -#' @param epi_data the epi_df -#' @param cols_modified the list of columns to modify. If this is `NULL`, that means we were modifying every column. -#' @param predictors the initial set of predictors; any unmodified are kept, any modified are replaced +#' @param epi_data the epi_df, only included to get the non-key column names +#' @param cols_modified the list of columns which have been modified. If this is `NULL`, that means we were modifying every column. +#' @param predictors the initial set of predictors; any unmodified are kept, any modified are replaced with the modified versions (e.g. "a" becoming "a_m17"). #' @importFrom purrr map map_chr reduce +#' @return returns an updated list of predictors, with modified columns replaced and non-modified columns left intact. 
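+#' A sketch of the replacement semantics, mirroring test-transforms.R (here "b" has already
+#' gained the derived columns "b_m14" and "b_sd28" while "a" is unmodified):
+#'   update_predictors(epi_data, cols_modified = c("b"), predictors = c("a", "b"))
+#'   # returns c("a", "b_m14", "b_sd28"): "b" is swapped for its derived columns, "a" is kept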
#' @export update_predictors <- function(epi_data, cols_modified, predictors) { if (!is.null(cols_modified)) { # if cols_modified isn't null, make sure we include predictors that weren't modified - other_predictors <- map(cols_modified, ~ !grepl(.x, predictors)) %>% reduce(`&`) - other_predictors <- predictors[other_predictors] + unchanged_predictors <- map(cols_modified, ~ !grepl(.x, predictors, fixed = TRUE)) %>% reduce(`&`) + unchanged_predictors <- predictors[unchanged_predictors] } else { - other_predictors <- character(0L) + # if it's null, we've modified every predictor + unchanged_predictors <- character(0L) } # all the non-key names col_names <- get_nonkey_names(epi_data) - is_present <- function(x) { - grepl(x, col_names) & !(col_names %in% predictors) + is_present <- function(original_predictor) { + grepl(original_predictor, col_names) & !(col_names %in% predictors) } is_modified <- map(predictors, is_present) %>% reduce(`|`) new_predictors <- col_names[is_modified] - return(c(other_predictors, new_predictors)) + return(c(unchanged_predictors, new_predictors)) } #' get a rolling average for the named columns From d6d4cddda61508ec630fe4b89ade3d054604d041 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 10:46:28 -0800 Subject: [PATCH 14/20] fix before behavior: mean tests simplified --- R/data_transforms.R | 6 ++--- tests/testthat/test-transforms.R | 42 +++++++++++++++++--------------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/R/data_transforms.R b/R/data_transforms.R index 1ccb3ec9..58eb306e 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -71,7 +71,7 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) { epi_data %<>% group_by(geo_value) for (col in cols_to_mean) { mean_name <- paste0(col, "_m", width) - epi_data %<>% epi_slide(~ mean(.x[[col]]), before = width, new_col_name = mean_name) + epi_data %<>% epi_slide(~ mean(.x[[col]]), before = width-1L, new_col_name = mean_name) } epi_data %<>% ungroup() return(epi_data) @@ -102,8 +102,8 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = result %<>% group_by(geo_value) mean_name <- paste0(col, "_m", mean_width) sd_name <- paste0(col, "_sd", sd_width) - result %<>% epi_slide(~ mean(.x[[col]]), before = mean_width, new_col_name = mean_name) - result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2)), before = sd_width, new_col_name = sd_name) + result %<>% epi_slide(~ mean(.x[[col]]), before = mean_width-1L, new_col_name = mean_name) + result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2)), before = sd_width-1, new_col_name = sd_name) if (!keep_mean) { # TODO make sure the extra info sticks around result %<>% select(-{{ mean_name }}) diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R index 4020d437..e6b79ed8 100644 --- a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -3,7 +3,7 @@ n_days <- 40 removed_date <- 10 simple_dates <- seq(as.Date("2012-01-01"), by = "day", length.out = n_days) simple_dates <- simple_dates[-removed_date] -rand_vals <- rnorm(n_days-1) +rand_vals <- rnorm(n_days - 1) # Two states, with 2 variables. 
a is linear, going up in one state and down in the other # b is just random @@ -11,12 +11,12 @@ rand_vals <- rnorm(n_days-1) epi_data <- epiprocess::as_epi_df(rbind(tibble( geo_value = "al", time_value = simple_dates, - a = 1:(n_days-1), + a = 1:(n_days - 1), b = rand_vals ), tibble( geo_value = "ca", time_value = simple_dates, - a = (n_days-1):1, + a = (n_days - 1):1, b = rand_vals + 10 ))) test_that("rolling_mean generates correct mean", { @@ -24,21 +24,23 @@ test_that("rolling_mean generates correct mean", { rolled expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m7", "b_m7")) # hand specified rolling mean with a rear window of 7, noting that mean(1:7) = 4 - linear_roll_mean <- c(seq(from = 1, to = 4, by = .5), seq(from = 4.5, to = 35.5, by = 1)) - # day 10 is missing, so days 11-18 are thrown off - lag_st <- 10 - unusual_days <- c(mean(c((lag_st):(lag_st-6))), mean(c((lag_st+1):(lag_st+1-6))), mean(c((lag_st+2):(lag_st+2-6))), mean(c((lag_st+3):(lag_st+3-6))), mean(c((lag_st+4):(lag_st+4-6))), mean(c((lag_st+5):(lag_st+5-6))), mean(c((lag_st+6):(lag_st+6-6)))) + linear_roll_mean <- c(seq(from = 1, by = .5, length.out = 7), seq(from = 5, to = 36, by = 1)) + # day 10 is missing, so the average days 11-16 are thrown off, only using 6 values instead of 7 + gap_starts <- epi_data %>% filter(geo_value == "al" & time_value == as.Date("2012-01-11")) %>% pull(a) + unusual_days <- map_vec(seq(from = 0, to = 5), \(d) mean(((gap_starts + d) - 0):((gap_starts + d) - 5))) # stitching the lag induced hiccup into the "normal" mean values - expected_mean <- c(linear_roll_mean[1:9], unusual_days, linear_roll_mean[17:(n_days-1)]) + expected_mean <- c(linear_roll_mean[1:9], unusual_days, linear_roll_mean[16:(n_days - 1)]) + expected_mean expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m7"), expected_mean) + # Doing the same for California # same, but "ca" is reversed, noting mean(40:(40-7)) =36.5 - linear_reverse_roll_mean <- c(seq(from = 39, to = 35.5, by = -0.5), seq(from = 34.5, to = 4.5, by = -1)) - lag_st <- 36 - # day 10 is missing, so days 11-18 are thrown off - unusual_days <- c(mean(c((lag_st):(lag_st-6))), mean(c((lag_st-1):(lag_st-1-6))), mean(c((lag_st-2):(lag_st-2-6))), mean(c((lag_st-3):(lag_st-3-6))), mean(c((lag_st-4):(lag_st-4-6))), mean(c((lag_st-5):(lag_st-5-6))), mean(c((lag_st-6):(lag_st-6-6)))) + linear_reverse_roll_mean <- c(seq(from = 39, by = -0.5, length.out = 7), seq(from = 35, to = 4, by = -1)) + # day 10 is missing, so days 11-16 are thrown off + gap_starts <- epi_data %>% filter(geo_value == "ca" & time_value == as.Date("2012-01-11")) %>% pull(a) + unusual_days <- map_vec(seq(from = 0, to = 5), \(d) mean(((gap_starts - d) + 0):((gap_starts - d) + 5))) # stitching the lag induced hiccup into the "normal" mean values - expected_mean <- c(linear_reverse_roll_mean[1:9], unusual_days, linear_reverse_roll_mean[17:(n_days-1)]) + expected_mean <- c(linear_reverse_roll_mean[1:9], unusual_days, linear_reverse_roll_mean[16:(n_days - 1)]) # actually testing expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), expected_mean) @@ -47,16 +49,16 @@ test_that("rolling_mean generates correct mean", { }) test_that("rolling_sd generates correct standard deviation", { - rolled <- rolling_sd(epi_data,keep_mean = TRUE) + rolled <- rolling_sd(epi_data, keep_mean = TRUE) expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_sd28", "b_m14", "b_sd28")) # hand specified rolling mean with a rear window of 7, noting that mean(1:14) 
= 7.5 linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), seq(from = 8.5, to = 16.5, by = 1), seq(from = 17, to = 32, by = 1)) linear_roll_mean expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m14"), linear_roll_mean) # and the standard deviation is linear_roll_mean <- append(linear_roll_mean, NA, after = removed_date - 1) linear_values <- 1:39 linear_values <- append(linear_values, NA, after = removed_date - 1) - linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28)) + linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28 - 1)) # drop the extra date caused by the inclusion of the NAs linear_roll_sd <- linear_roll_sd[-(removed_date)] expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_sd28"), linear_roll_sd) @@ -75,10 +77,10 @@ test_that("get_trainable_names pulls out mean and sd columns", { # TODO example with NA's, example with missing days, only one column, keep_mean test_that("update_predictors keeps unmodified predictors", { - epi_data["c"] = NaN - epi_data["d"] = NaN - epi_data["b_m14"] = NaN - epi_data["b_sd28"] = NaN + epi_data["c"] <- NaN + epi_data["d"] <- NaN + epi_data["b_m14"] <- NaN + epi_data["b_sd28"] <- NaN predictors <- c("a", "b", "c") # everything but d modified <- c("b", "c") # we want to exclude b but not its modified versions expected_predictors <- c("a", "b_m14", "b_sd28") From d542be27e21418b201e802d6ec6be027db9657bc Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 12:08:35 -0800 Subject: [PATCH 15/20] various suggestions from logan, before=n_points-1 --- NAMESPACE | 1 + R/data_transforms.R | 6 +- R/forecaster_scaled_pop.R | 2 +- R/forecaster_smoothed_scaled.R | 72 +++++++++++++----------- R/targets_utils.R | 4 +- man/get_trainable_names.Rd | 2 +- man/smoothed_scaled.Rd | 13 ++++- man/update_predictors.Rd | 13 +++-- tests/testthat/test-forecasters-basics.R | 1 - tests/testthat/test-transforms.R | 11 +++- 10 files changed, 75 insertions(+), 50 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c4dd11ea..e6f711c3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -113,3 +113,4 @@ importFrom(tidyr,drop_na) importFrom(tidyr,expand_grid) importFrom(tidyr,pivot_wider) importFrom(tidyr,unnest) +importFrom(zeallot,"%<-%") diff --git a/R/data_transforms.R b/R/data_transforms.R index 58eb306e..7329823d 100644 --- a/R/data_transforms.R +++ b/R/data_transforms.R @@ -71,7 +71,7 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) { epi_data %<>% group_by(geo_value) for (col in cols_to_mean) { mean_name <- paste0(col, "_m", width) - epi_data %<>% epi_slide(~ mean(.x[[col]]), before = width-1L, new_col_name = mean_name) + epi_data %<>% epi_slide(~ mean(.x[[col]], na.rm = TRUE), before = width-1L, new_col_name = mean_name) } epi_data %<>% ungroup() return(epi_data) @@ -102,8 +102,8 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = result %<>% group_by(geo_value) mean_name <- paste0(col, "_m", mean_width) sd_name <- paste0(col, "_sd", sd_width) - result %<>% epi_slide(~ mean(.x[[col]]), before = mean_width-1L, new_col_name = mean_name) - result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2)), before = sd_width-1, new_col_name = sd_name) + result %<>% epi_slide(~ mean(.x[[col]], na.rm = TRUE), before = mean_width-1L, new_col_name = mean_name) + result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2, na.rm = TRUE)), before = sd_width-1, new_col_name = sd_name) if (!keep_mean) { # TODO make sure the extra info
sticks around result %<>% select(-{{ mean_name }}) diff --git a/R/forecaster_scaled_pop.R b/R/forecaster_scaled_pop.R index 6db0b8b0..2b5d0020 100644 --- a/R/forecaster_scaled_pop.R +++ b/R/forecaster_scaled_pop.R @@ -73,7 +73,7 @@ scaled_pop <- function(epi_data, args_input[["ahead"]] <- effective_ahead args_input[["quantile_levels"]] <- quantile_levels args_list <- do.call(arx_args_list, args_input) - # if you want to ignore extra_sources, setting predictors is the way to do it + # if you want to hardcode particular predictors in a particular forecaster predictors <- c(outcome, extra_sources) # TODO: Partial match quantile_level coming from here (on Dmitry's machine) argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R index becfb3a3..b1c02275 100644 --- a/R/forecaster_smoothed_scaled.R +++ b/R/forecaster_smoothed_scaled.R @@ -14,8 +14,17 @@ #' to the max time value of the `epi_df`. how to handle this is a modelling #' question left up to each forecaster; see latency_adjusting.R for the #' existing examples) -#' @param pop_scaling an example extra parameter unique to this forecaster -#' @param trainer an example extra parameter that is fairly common +#' @param pop_scaling bool; if `TRUE`, assume all numeric columns are on the +#' count scale and translate them to a rate scale for model fitting. +#' Predictions will be translated back to count scale. Any +#' `layer_residual_quantiles` (for non-`"quantile_reg"` `trainer`s) will be +#' done on the rate scale. When specifying predictor lags, note that rate +#' variables will use the same names as and overwrite the count variables. +#' Rates here will be counts per 100k population, based on +#' `epipredict::state_census`. +#' @param trainer optional; parsnip model specification to use for the core +#' fitting & prediction (the `spec` of the internal +#' [`epipredict::epi_workflow`]). Default is `parsnip::linear_reg()`. #' @param smooth_width the number of days over which to do smoothing. If `NULL`, #' then no smoothing is applied. #' @param smooth_cols the names of the columns to smooth. If `NULL` it smooths @@ -34,57 +43,52 @@ #' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling #' @importFrom tibble tibble #' @importFrom recipes all_numeric +#' @importFrom zeallot %<-% #' @export smoothed_scaled <- function(epi_data, - outcome, - extra_sources = "", - ahead = 1, - pop_scaling = TRUE, - trainer = parsnip::linear_reg(), - quantile_levels = covidhub_probs(), - smooth_width = 7, - smooth_cols = NULL, - sd_width = 28, - sd_mean_width = 14, - sd_cols = NULL, - ...) { + outcome, + extra_sources = "", + ahead = 1, + pop_scaling = TRUE, + trainer = parsnip::linear_reg(), + quantile_levels = covidhub_probs(), + smooth_width = 7, + smooth_cols = NULL, + sd_width = 28, + sd_mean_width = 14, + sd_cols = NULL, + ...) 
{ # perform any preprocessing not supported by epipredict # this is a temp fix until a real fix gets put into epipredict epi_data <- clear_lastminute_nas(epi_data) # one that every forecaster will need to handle: how to manage max(time_value) # that's older than the `as_of` date - epidataAhead <- extend_ahead(epi_data, ahead) + c(epi_data, effective_ahead) %<-% extend_ahead(epi_data, ahead) # see latency_adjusting for other examples - # this next part is basically unavoidable boilerplate you'll want to copy - epi_data <- epidataAhead[[1]] - effective_ahead <- epidataAhead[[2]] args_input <- list(...) # edge case where there is no data or less data than the lags; eventually epipredict will handle this if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) { - null_result <- tibble( - geo_value = character(), - forecast_date = lubridate::Date(), - target_end_date = lubridate::Date(), - quantile = numeric(), - value = numeric() - ) + null_result <- epi_data[0L, c("geo_value", attr(epi_data, "metadata", exact = TRUE)[["other_keys"]])] %>% + mutate( + forecast_date = epi_data$time_value[0], + target_end_date = epi_data$time_value[0], + quantile = numeric(), + value = numeric() + ) return(null_result) } args_input[["ahead"]] <- effective_ahead args_input[["quantile_levels"]] <- quantile_levels args_list <- do.call(arx_args_list, args_input) - # if you want to ignore extra_sources, setting predictors is the way to do it + # `extra_sources` sets which variables beyond the outcome are lagged and used as predictors + # any which are modified by `rolling_mean` or `rolling_sd` have their original values dropped later predictors <- c(outcome, extra_sources) - # TODO: Partial match quantile_level coming from here (on Dmitry's machine) - argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) - args_list <- argsPredictorsTrainer[[1]] - predictors <- argsPredictorsTrainer[[2]] - trainer <- argsPredictorsTrainer[[3]] # end of the copypasta # finally, any other pre-processing (e.g. smoothing) that isn't performed by # epipredict # smoothing - keep_mean <- (smooth_width == sd_mean_width) # do we need to do the mean separately? + keep_mean <- !is.null(smooth_width) && !is.null(sd_mean_width) && + smooth_width == sd_mean_width # do we (not) need to do the mean separately? 
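+  # e.g. with the defaults smooth_width = 7 and sd_mean_width = 14 this is FALSE, so the 7-day
+  # mean becomes its own column and rolling_sd computes (then drops) a separate 14-day mean;
+  # if both widths were 14, the single shared mean would be computed once by rolling_sd and kept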
if (!is.null(smooth_width) && !keep_mean) { epi_data %<>% rolling_mean( width = smooth_width, @@ -101,8 +105,10 @@ smoothed_scaled <- function(epi_data, keep_mean = keep_mean ) } - # and need to make sure we exclude the original varialbes as predictors + # and need to make sure we exclude the original variables as predictors predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors) + # TODO: Partial match quantile_level coming from here (on Dmitry's machine) + c(args_list, predictors, trainer) %<-% perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) # preprocessing supported by epipredict preproc <- epi_recipe(epi_data) if (pop_scaling) { diff --git a/R/targets_utils.R b/R/targets_utils.R index 1743e5da..033a20b1 100644 --- a/R/targets_utils.R +++ b/R/targets_utils.R @@ -133,7 +133,7 @@ make_shared_grids <- function() { forecaster = "scaled_pop", trainer = c("linreg", "quantreg"), ahead = c(1:7, 14, 21, 28), - lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14)), + lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14), c(0,7,14,24)), pop_scaling = c(FALSE) ), tidyr::expand_grid( @@ -144,7 +144,7 @@ forecaster = "smoothed_scaled", trainer = c("quantreg"), ahead = c(1:7, 14, 21, 28), - lags = list(list(c(0, 3, 5, 7, 14), c(0)), c(0, 7, 14)), + lags = list(list(c(0, 3, 5, 7, 14), c(0), c(0, 3, 5, 7, 14), c(0)), c(0, 7, 14), c(0,2,4,7,14,21,28)), pop_scaling = c(FALSE) ) ) diff --git a/man/get_trainable_names.Rd b/man/get_trainable_names.Rd index 700c9216..8f5faf10 100644 --- a/man/get_trainable_names.Rd +++ b/man/get_trainable_names.Rd @@ -7,7 +7,7 @@ get_trainable_names(epi_data, cols) } \arguments{ -\item{epi_data}{the epi_data tibble} +\item{epi_data}{the \code{epi_df}} \item{cols}{vector of column names to use. If \code{NULL}, fill with all non-key columns} } diff --git a/man/smoothed_scaled.Rd b/man/smoothed_scaled.Rd index 683c56d7..7306d3b3 100644 --- a/man/smoothed_scaled.Rd +++ b/man/smoothed_scaled.Rd @@ -34,9 +34,18 @@ to the max time value of the \code{epi_df}. how to handle this is a modelling question left up to each forecaster; see latency_adjusting.R for the existing examples)} -\item{pop_scaling}{an example extra parameter unique to this forecaster} +\item{pop_scaling}{bool; if \code{TRUE}, assume all numeric columns are on the +count scale and translate them to a rate scale for model fitting. +Predictions will be translated back to count scale. Any +\code{layer_residual_quantiles} (for non-\code{"quantile_reg"} \code{trainer}s) will be +done on the rate scale. When specifying predictor lags, note that rate +variables will use the same names as and overwrite the count variables. +Rates here will be counts per 100k population, based on +\code{epipredict::state_census}.} -\item{trainer}{an example extra parameter that is fairly common} +\item{trainer}{optional; parsnip model specification to use for the core +fitting & prediction (the \code{spec} of the internal +\code{\link[epipredict:epi_workflow]{epipredict::epi_workflow}}). Default is \code{parsnip::linear_reg()}.} \item{quantile_levels}{The quantile levels to predict.
Defaults to those} diff --git a/man/update_predictors.Rd b/man/update_predictors.Rd index 9b9e603c..435e5aba 100644 --- a/man/update_predictors.Rd +++ b/man/update_predictors.Rd @@ -7,12 +7,17 @@ update_predictors(epi_data, cols_modified, predictors) } \arguments{ -\item{epi_data}{the epi_df} +\item{epi_data}{the epi_df, only included to get the non-key column names} -\item{cols_modified}{the list of columns} +\item{cols_modified}{the list of columns which have been modified. If this is \code{NULL}, that means we were modifying every column.} -\item{predictors}{the initial set of predictors; any unmodified are kept, any modified are replaced} +\item{predictors}{the initial set of predictors; any unmodified are kept, any modified are replaced with the modified versions (e.g. "a" becoming "a_m17").} +} +\value{ +returns an updated list of predictors, with modified columns replaced and non-modified columns left intact. } \description{ -should only be applied after both rolling_mean and rolling_sd +modifies the list of predictors so that any which have been modified have the +modified versions included, and not the original. Should only be applied +after both rolling_mean and rolling_sd. } diff --git a/tests/testthat/test-forecasters-basics.R b/tests/testthat/test-forecasters-basics.R index 785b6ad2..0c61a8d8 100644 --- a/tests/testthat/test-forecasters-basics.R +++ b/tests/testthat/test-forecasters-basics.R @@ -5,7 +5,6 @@ forecasters <- list( c("flatline_fc", flatline_fc), c("smoothed_scaled", smoothed_scaled) ) -forecaster <- forecasters[[3]] for (forecaster in forecasters) { test_that(paste(forecaster[[1]], "gets the date and columns right"), { jhu <- epipredict::case_death_rate_subset %>% diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R index e6b79ed8..fdea75b2 100644 --- a/tests/testthat/test-transforms.R +++ b/tests/testthat/test-transforms.R @@ -52,14 +52,19 @@ test_that("rolling_sd generates correct standard deviation", { rolled <- rolling_sd(epi_data, keep_mean = TRUE) expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_sd28", "b_m14", "b_sd28")) # hand specified rolling mean with a rear window of 7, noting that mean(1:14) = 7.5 - linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), seq(from = 8.5, to = 16.5, by = 1), seq(from = 17, to = 32, by = 1)) - linear_roll_mean + linear_roll_mean <- c(seq(from = 1, to = 7, by = .5), seq(from = 8, to = 16, by = 1), seq(from = 16.5, to = 32.5, by = 1)) +## linear_roll_mean <- c(seq(from = 1, by = .5, length.out = 14), seq(from = 8.5, to = 32.5, by = 1)) +## gap_starts <- epi_data %>% filter(geo_value == "al" & time_value == as.Date("2012-01-11")) %>% pull(a) +## unusual_days <- map_vec(seq(from = 0, to = 5), \(d) mean(((gap_starts + d) - 0):max((gap_starts + d) - 14, 1))) +## map(seq(from = 0, to = 5), \(d) mean(((gap_starts + d) - 0):max((gap_starts + d) - 13, 1))) +## linear_roll_mean +## rolled %>% filter(geo_value == "al") %>% pull("a_m14") expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m14"), linear_roll_mean) # and the standard deviation is linear_roll_mean <- append(linear_roll_mean, NA, after = removed_date - 1) linear_values <- 1:39 linear_values <- append(linear_values, NA, after = removed_date - 1) - linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28)) + linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28 - 1)) # drop the
extra date caused by the inclusion of the NAs linear_roll_sd <- linear_roll_sd[-(removed_date)] expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_sd28"), linear_roll_sd) From 16dd527a250e3a02072360a08652d5abe3383148 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 16:21:17 -0800 Subject: [PATCH 16/20] fix tests (sd lag should only be 0) --- DESCRIPTION | 3 ++- R/data_validation.R | 2 +- R/forecaster_smoothed_scaled.R | 2 ++ tests/testthat/test-forecasters-data.R | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 21fbcc91..01a09444 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,7 +30,8 @@ Imports: slider, targets, tibble, - tidyr + tidyr, + zeallot Suggests: ggplot2, knitr, diff --git a/R/data_validation.R b/R/data_validation.R index da4c134e..2a0838e5 100644 --- a/R/data_validation.R +++ b/R/data_validation.R @@ -56,7 +56,7 @@ perform_sanity_checks <- function(epi_data, #' @export confirm_sufficient_data <- function(epi_data, ahead, args_input, buffer = 9) { if (!is.null(args_input$lags)) { - lag_max <- max(args_input$lags) + lag_max <- max(unlist(args_input$lags)) } else { lag_max <- 14 # default value of 2 weeks } diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R index b1c02275..32a3929d 100644 --- a/R/forecaster_smoothed_scaled.R +++ b/R/forecaster_smoothed_scaled.R @@ -5,6 +5,8 @@ #' /target/ it still uses the raw value (this captures some of the noise). It #' also uses a rolling standard deviation as an auxillary signal, window of #' withd `sd_width`, which by default is 28 days. +#' If you are using `sd_width`, you should restrict the lags on the `sd` to only +#' include `0`, so set your lags to be e.g. `list(c(0,7,14), c(0))`. #' @param epi_data the actual data used #' @param outcome the name of the target variable #' @param extra_sources the name of any extra columns to use. 
This list could be diff --git a/tests/testthat/test-forecasters-data.R b/tests/testthat/test-forecasters-data.R index d12b4a40..4fc65f5d 100644 --- a/tests/testthat/test-forecasters-data.R +++ b/tests/testthat/test-forecasters-data.R @@ -7,7 +7,7 @@ forecasters <- tibble::tribble( scaled_pop, list(1, TRUE), list("ahead", "pop_scaling"), "scaled_pop", scaled_pop, list(1, FALSE), list("ahead", "pop_scaling"), "scaled_pop", flatline_fc, list(1), list("ahead"), "flatline_fc", - smoothed_scaled, list(1), list("ahead"), "smoothed_scaled" + smoothed_scaled, list(1, list(c(0,7,14), c(0))), list("ahead", "lags"), "smoothed_scaled" ) expects_nonequal <- c("scaled_pop", "smoothed_scaled") synth_mean <- 25 From 4b34428ba2ae67e3587644eef124b2604ebaa011 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 16:31:48 -0800 Subject: [PATCH 17/20] include smoothed_scaling in the targets --- R/targets_utils.R | 11 ++--------- covid_hosp_explore.R | 7 +++++++ flu_hosp_explore.R | 10 +++++++++- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/R/targets_utils.R b/R/targets_utils.R index 033a20b1..d3fd435e 100644 --- a/R/targets_utils.R +++ b/R/targets_utils.R @@ -127,26 +127,19 @@ make_shared_grids <- function() { forecaster = "scaled_pop", trainer = c("linreg", "quantreg"), ahead = c(1:7, 14, 21, 28), - pop_scaling = c(FALSE) + pop_scaling = FALSE ), tidyr::expand_grid( forecaster = "scaled_pop", trainer = c("linreg", "quantreg"), ahead = c(1:7, 14, 21, 28), lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14), c(0,7,14,24)), - pop_scaling = c(FALSE) + pop_scaling = FALSE ), tidyr::expand_grid( forecaster = "flatline_fc", ahead = c(1:7, 14, 21, 28) ), - tidyr::expand_grid( - forecaster = "smoothed_scaled", - trainer = c("quantreg"), - ahead = c(1:7, 14, 21, 28), - lags = list(list(c(0, 3, 5, 7, 14), c(0),c(0, 3, 5, 7, 14), c(0),), c(0, 7, 14), c(0,2,4,7,14,21,28)), - pop_scaling = c(FALSE) - ) ) } #' Make list of common ensembles for forecasting experiments across projects diff --git a/covid_hosp_explore.R b/covid_hosp_explore.R index f9aa0d8e..456b4b3e 100644 --- a/covid_hosp_explore.R +++ b/covid_hosp_explore.R @@ -18,6 +18,13 @@ make_unique_grids <- function() { ahead = c(1:7, 14, 21, 28), lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14)), pop_scaling = TRUE + ), + tidyr::expand_grid( + forecaster = "smoothed_scaled", + trainer = c("quantreg"), + ahead = c(1:7, 14, 21, 28), + lags = list(list(c(0, 3, 5, 7, 14), c(0), c(0, 3, 5, 7, 14), c(0),), list(c(0, 7, 14), c(0)), list(c(0,2,4,7,14,21,28), c(0))), + pop_scaling = TRUE ) ) } diff --git a/flu_hosp_explore.R b/flu_hosp_explore.R index a3486080..31d0ffc1 100644 --- a/flu_hosp_explore.R +++ b/flu_hosp_explore.R @@ -5,7 +5,15 @@ source("extras/targets-common.R") # Add custom parameter combinations in the list below. 
make_unique_grids <- function() { - list() + list( + tidyr::expand_grid( + forecaster = "smoothed_scaled", + trainer = c("quantreg"), + ahead = c(1:7, 14, 21, 28), + lags = list(list(c(0, 3, 5, 7, 14), c(0), c(0, 3, 5, 7, 14), c(0)), list(c(0, 7, 14), c(0)), list(c(0,2,4,7,14,21,28), c(0))), + pop_scaling = FALSE + ) + ) } make_unique_ensemble_grid <- function() { tribble( From af3ff9535d3918d640f92a0280a077edde1b374b Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 16:41:46 -0800 Subject: [PATCH 18/20] perform_sanity_checks -> sanitize_args_predictors_trainer --- NAMESPACE | 2 +- R/data_validation.R | 2 +- R/forecaster_flatline.R | 4 +--- R/forecaster_scaled_pop.R | 7 ++----- R/forecaster_smoothed_scaled.R | 4 ++-- ...checks.Rd => sanitize_args_predictors_trainer.Rd} | 12 +++++++++--- man/scaled_pop.Rd | 2 +- man/smoothed_scaled.Rd | 4 +++- tests/testthat/test-forecaster-utils.R | 6 +++--- 9 files changed, 23 insertions(+), 20 deletions(-) rename man/{perform_sanity_checks.Rd => sanitize_args_predictors_trainer.Rd} (85%) diff --git a/NAMESPACE b/NAMESPACE index e6f711c3..3298ff06 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,12 +28,12 @@ export(make_shared_grids) export(make_target_ensemble_grid) export(make_target_param_grid) export(overprediction) -export(perform_sanity_checks) export(read_external_predictions_data) export(rolling_mean) export(rolling_sd) export(run_evaluation_measure) export(run_workflow_and_format) +export(sanitize_args_predictors_trainer) export(scaled_pop) export(sharpness) export(single_id) diff --git a/R/data_validation.R b/R/data_validation.R index 2a0838e5..e9319334 100644 --- a/R/data_validation.R +++ b/R/data_validation.R @@ -13,7 +13,7 @@ #' include empty strings #' @param args_list the args list created by [`epipredict::arx_args_list`] #' @export -perform_sanity_checks <- function(epi_data, +sanitize_args_predictors_trainer <- function(epi_data, outcome, predictors, trainer, diff --git a/R/forecaster_flatline.R b/R/forecaster_flatline.R index ab0c5075..687511b0 100644 --- a/R/forecaster_flatline.R +++ b/R/forecaster_flatline.R @@ -40,9 +40,7 @@ flatline_fc <- function(epi_data, args_list <- do.call(flatline_args_list, args_input) # if you want to ignore extra_sources, setting predictors is the way to do it predictors <- c(outcome, extra_sources) - argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, NULL, args_list) - args_list <- argsPredictorsTrainer[[1]] - predictors <- argsPredictorsTrainer[[2]] + c(args_list, predictors) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, NULL, args_list) # end of the copypasta # finally, any other pre-processing (e.g. smoothing) that isn't performed by # epipredict diff --git a/R/forecaster_scaled_pop.R b/R/forecaster_scaled_pop.R index 2b5d0020..c849cd9e 100644 --- a/R/forecaster_scaled_pop.R +++ b/R/forecaster_scaled_pop.R @@ -35,7 +35,7 @@ #' @param quantile_levels The quantile levels to predict. Defaults to those required by #' covidhub.
#' @seealso some utilities for making forecasters: [format_storage], -#' [perform_sanity_checks] +#' [sanitize_args_predictors_trainer] #' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling #' @importFrom tibble tibble #' @importFrom recipes all_numeric @@ -76,10 +76,7 @@ scaled_pop <- function(epi_data, # if you want to hardcode particular predictors in a particular forecaster predictors <- c(outcome, extra_sources) # TODO: Partial match quantile_level coming from here (on Dmitry's machine) - argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) - args_list <- argsPredictorsTrainer[[1]] - predictors <- argsPredictorsTrainer[[2]] - trainer <- argsPredictorsTrainer[[3]] + c(args_list, predictors, trainer) <- sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list) # end of the copypasta # finally, any other pre-processing (e.g. smoothing) that isn't performed by # epipredict diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R index 32a3929d..8428cc08 100644 --- a/R/forecaster_smoothed_scaled.R +++ b/R/forecaster_smoothed_scaled.R @@ -41,7 +41,7 @@ #' @param ... any additional arguments as used by [arx_args_list] #' required by covidhub. #' @seealso some utilities for making forecasters: [format_storage], -#' [perform_sanity_checks] +#' [sanitize_args_predictors_trainer] #' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling #' @importFrom tibble tibble #' @importFrom recipes all_numeric @@ -110,7 +110,7 @@ smoothed_scaled <- function(epi_data, # and need to make sure we exclude the original variables as predictors predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors) # TODO: Partial match quantile_level coming from here (on Dmitry's machine) - c(args_list, predictors, trainer) %<-% perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) + c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list) # preprocessing supported by epipredict preproc <- epi_recipe(epi_data) if (pop_scaling) { diff --git a/man/perform_sanity_checks.Rd b/man/sanitize_args_predictors_trainer.Rd similarity index 85% rename from man/perform_sanity_checks.Rd rename to man/sanitize_args_predictors_trainer.Rd index 152b3bf1..b56706ee 100644 --- a/man/perform_sanity_checks.Rd +++ b/man/sanitize_args_predictors_trainer.Rd @@ -1,10 +1,16 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_validation.R -\name{perform_sanity_checks} -\alias{perform_sanity_checks} +\name{sanitize_args_predictors_trainer} +\alias{sanitize_args_predictors_trainer} \title{helper function for those writing forecasters} \usage{ -perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list) +sanitize_args_predictors_trainer( + epi_data, + outcome, + predictors, + trainer, + args_list +) } \arguments{ \item{epi_data}{the actual data used} diff --git a/man/scaled_pop.Rd b/man/scaled_pop.Rd index 880819eb..5522c85c 100644 --- a/man/scaled_pop.Rd +++ b/man/scaled_pop.Rd @@ -63,5 +63,5 @@ arguments: } \seealso{ some utilities for making forecasters: \link{format_storage}, -\link{perform_sanity_checks} +\link{sanitize_args_predictors_trainer} } diff --git a/man/smoothed_scaled.Rd b/man/smoothed_scaled.Rd index 7306d3b3..16a3d2c0 100644 --- a/man/smoothed_scaled.Rd +++ b/man/smoothed_scaled.Rd @@ -73,8 
+73,10 @@ the data. Even if the target is smoothed when used as a /predictor/, as a /target/ it still uses the raw value (this captures some of the noise). It also uses a rolling standard deviation as an auxillary signal, window of withd \code{sd_width}, which by default is 28 days. +If you are using \code{sd_width}, you should restrict the lags on the \code{sd} to only +include \code{0}, so set your lags to be e.g. \code{list(c(0,7,14), c(0))}. } \seealso{ some utilities for making forecasters: \link{format_storage}, -\link{perform_sanity_checks} +\link{sanitize_args_predictors_trainer} } diff --git a/tests/testthat/test-forecaster-utils.R b/tests/testthat/test-forecaster-utils.R index 863acc80..dfe5f4ed 100644 --- a/tests/testthat/test-forecaster-utils.R +++ b/tests/testthat/test-forecaster-utils.R @@ -1,10 +1,10 @@ -test_that("perform_sanity_checks", { +test_that("sanitize_args_predictors_trainer", { epi_data <- epipredict::case_death_rate_subset # don't need to test validate_forecaster_inputs as that's inherited # testing args_list inheritance ex_args <- arx_args_list() - expect_error(perform_sanity_checks(epi_data, "case_rate", c("case_rate"), 5, ex_args)) - argsPredictors <- perform_sanity_checks( + expect_error(sanitize_args_predictors_trainer(epi_data, "case_rate", c("case_rate"), 5, ex_args)) + argsPredictors <- sanitize_args_predictors_trainer( epi_data, "case_rate", c("case_rate", ""), parsnip::linear_reg(), ex_args ) args_list <- argsPredictors[[1]] From ebb9db3c92f57e3354dab6c777cb1a4f07708603 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 19:38:02 -0800 Subject: [PATCH 19/20] zeallot (%<-%) needs all args --- R/forecaster_flatline.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/forecaster_flatline.R b/R/forecaster_flatline.R index 687511b0..33687240 100644 --- a/R/forecaster_flatline.R +++ b/R/forecaster_flatline.R @@ -40,7 +40,7 @@ flatline_fc <- function(epi_data, args_list <- do.call(flatline_args_list, args_input) # if you want to ignore extra_sources, setting predictors is the way to do it predictors <- c(outcome, extra_sources) - c(args_list, predictors) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, NULL, args_list) + c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, NULL, args_list) # end of the copypasta # finally, any other pre-processing (e.g. 
smoothing) that isn't performed by # epipredict From 15d35d784256cf82de8d0c54645e66e12bd789d4 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 22 Dec 2023 19:59:23 -0800 Subject: [PATCH 20/20] fix %<-% usage --- R/forecaster_scaled_pop.R | 3 ++- tests/testthat/test-forecasters-basics.R | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/R/forecaster_scaled_pop.R b/R/forecaster_scaled_pop.R index c849cd9e..6c138caa 100644 --- a/R/forecaster_scaled_pop.R +++ b/R/forecaster_scaled_pop.R @@ -38,6 +38,7 @@ #' [sanitize_args_predictors_trainer] #' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling #' @importFrom tibble tibble +#' @importFrom zeallot %<-% #' @importFrom recipes all_numeric #' @export scaled_pop <- function(epi_data, @@ -76,7 +77,7 @@ scaled_pop <- function(epi_data, # if you want to hardcode particular predictors in a particular forecaster predictors <- c(outcome, extra_sources) # TODO: Partial match quantile_level coming from here (on Dmitry's machine) - c(args_list, predictors, trainer) <- sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list) + c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list) # end of the copypasta # finally, any other pre-processing (e.g. smoothing) that isn't performed by # epipredict diff --git a/tests/testthat/test-forecasters-basics.R b/tests/testthat/test-forecasters-basics.R index 0c61a8d8..b0cc1da4 100644 --- a/tests/testthat/test-forecasters-basics.R +++ b/tests/testthat/test-forecasters-basics.R @@ -5,6 +5,7 @@ forecasters <- list( c("flatline_fc", flatline_fc), c("smoothed_scaled", smoothed_scaled) ) +forecaster <- c("scaled_pop", scaled_pop) for (forecaster in forecasters) { test_that(paste(forecaster[[1]], "gets the date and columns right"), { jhu <- epipredict::case_death_rate_subset %>%