-
Notifications
You must be signed in to change notification settings - Fork 0
Add the 7dav we talked about along with the std #76
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
e4b3d45
4a78810
1edc6f6
e63be89
ea335c9
8df5f05
36608f6
0eb61e3
c50249b
0b16c81
514219e
b976103
c4c7430
d6d4cdd
d542be2
16dd527
4b34428
af3ff95
ebb9db3
15d35d7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ Imports: | |
purrr, | ||
recipes (>= 1.0.4), | ||
rlang, | ||
slider, | ||
targets, | ||
tibble, | ||
tidyr | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
# various reusable transforms to apply before handing to epipredict | ||
|
||
#' extract the non-key, non-smoothed columns from epi_data
#'
#' @description
#' When `cols` is supplied it is passed through untouched; when it is `NULL`,
#' every non-key column is used, minus any column whose name matches the
#' derived rolling-mean/sd naming schema (e.g. `foo_m7`, `foo_SD28`).
#' @keywords internal
#' @param epi_data the epi_data tibble
#' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
get_trainable_names <- function(epi_data, cols) {
  if (!is.null(cols)) {
    return(cols)
  }
  candidates <- get_nonkey_names(epi_data)
  # drop anything named like the rolling average/sd columns created by
  # rolling_mean()/rolling_sd() below
  candidates[!grepl("_\\w{1,2}\\d+", candidates)]
}
|
||
#' just the names which aren't keys for an epi_df
#' @description
#' names, but it excludes keys: `geo_value`, `time_value`, and any
#' `other_keys` recorded in the epi_df's metadata attribute
#' @param epi_data the epi_df
#' @return a character vector of the non-key column names, returned visibly
#' @keywords internal
get_nonkey_names <- function(epi_data) {
  cols <- names(epi_data)
  key_cols <- c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys)
  # end on a value expression rather than an assignment so the result is
  # returned visibly
  cols[!(cols %in% key_cols)]
}
|
||
|
||
#' update the predictors to only contain the smoothed/sd versions of cols
#' @description
#' should only be applied after both rolling_mean and rolling_sd
#' @param epi_data the epi_df
#' @param cols_modified the list of columns that have been smoothed/sd'd
#' @param predictors the initial set of predictors; any unmodified are kept,
#'   any modified are replaced with their derived columns
#' @return a character vector of predictor column names
#' @importFrom purrr map map_chr reduce
#' @export
update_predictors <- function(epi_data, cols_modified, predictors) {
  if (!is.null(cols_modified)) {
    # if cols_modified isn't null, make sure we include predictors that weren't
    # modified. fixed = TRUE: column names are literal strings, not regular
    # expressions, so metacharacters in a name can't cause spurious matches
    other_predictors <- map(cols_modified, ~ !grepl(.x, predictors, fixed = TRUE)) %>% reduce(`&`)
    other_predictors <- predictors[other_predictors]
  } else {
    other_predictors <- character(0L)
  }
  # all the non-key names
  col_names <- get_nonkey_names(epi_data)
  # columns derived from `x` (their name contains `x`) that aren't themselves
  # one of the original predictors
  is_present <- function(x) {
    grepl(x, col_names, fixed = TRUE) & !(col_names %in% predictors)
  }
  is_modified <- map(predictors, is_present) %>% reduce(`|`)
  new_predictors <- col_names[is_modified]
  return(c(other_predictors, new_predictors))
}
|
||
#' get a rolling average for the named columns
#' @description
#' add column(s) that are the rolling means of the specified columns, as
#' implemented by slider. Defaults to the previous 7 days.
#' Currently only group_by's on the geo_value. Should probably extend to more
#' keys if you have them
#' @param epi_data the dataset
#' @param width the number of days (or examples, the sliding isn't time-aware) to use
#' @param cols_to_mean the non-key columns to take the mean over. `NULL` means all
#' @importFrom slider slide_dbl
#' @export
rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
  cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
  epi_data <- group_by(epi_data, geo_value)
  for (target_col in cols_to_mean) {
    # derived columns follow the `<name>_m<width>` schema, e.g. `cases_m7`
    mean_col <- paste0(target_col, "_m", width)
    epi_data <- mutate(
      epi_data,
      {{ mean_col }} := slider::slide_dbl(.data[[target_col]], mean, .before = width)
    )
  }
  epi_data <- ungroup(epi_data)
  return(epi_data)
}
|
||
#' store the metadata in an easy to reapply way
#' @description
#' pull `as_of`, `geo_type`, and `time_type` out of the epi_df's metadata
#' attribute, bundling any remaining entries under `all_others`, so the whole
#' set can be reapplied after operations that strip epi_df attributes
#' @param epi_data the epi_df
#' @importFrom purrr list_modify
cache_metadata <- function(epi_data) {
  metadata <- attributes(epi_data)$metadata
  # everything that isn't one of the three standard fields
  all_others <- metadata
  for (standard_field in c("geo_type", "time_type", "as_of")) {
    all_others[standard_field] <- NULL
  }
  # removing every element leaves a list with character(0) names; normalize
  # that (and a NULL metadata attribute) to a plain empty list
  if (length(all_others) == 0) {
    all_others <- list()
  }
  features <- list(
    as_of = metadata$as_of,
    geo_type = metadata$geo_type,
    time_type = metadata$time_type,
    all_others = all_others
  )
  return(features)
}
|
||
#' get a rolling standard deviation for the named columns
#' @description
#' A rolling standard deviation, based off of a rolling mean. First it
#' calculates a rolling mean with width `mean_width`, and then squares the
#' difference between that and the actual value, averaged over `sd_width`.
#' @param epi_data the dataset
#' @param sd_width the number of days (or examples, the sliding isn't
#'   time-aware) to use for the standard deviation calculation
#' @param mean_width like `sd_width`, but it governs the mean. Should be less
#'   than the `sd_width`, and if `NULL` (the default) it is half of `sd_width`
#'   (so 14 in the complete default case)
#' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
#' @param keep_mean bool, if `TRUE`, it keeps the intermediate mean column
#' @importFrom slider slide_dbl slide2_dbl
#' @export
rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
  if (is.null(mean_width)) {
    mean_width <- as.integer(ceiling(sd_width / 2))
  }
  cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
  # mutate()/select() strip epi_df attributes, so cache them for restoration
  metadata <- cache_metadata(epi_data)
  epi_data %<>% group_by(geo_value)
  for (col in cols_to_sd) {
    mean_name <- paste0(col, "_m", mean_width)
    sd_name <- paste0(col, "_SD", sd_width)
    # NOTE(review): `.before = w` windows include the current observation plus
    # the `w` prior ones (w + 1 points) -- confirm that is the intent
    epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width))
    epi_data %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width))
    if (!keep_mean) {
      epi_data %<>% select(-{{ mean_name }})
    }
  }
  epi_data %<>% ungroup()
  # restore the epi_df class/metadata exactly once, after every column
  # operation and the ungroup(); previously this ran inside the loop, redoing
  # the coercion per column and leaving the trailing ungroup() free to strip
  # the just-restored attributes
  epi_data %<>% as_epi_df(metadata$geo_type, metadata$time_type, metadata$as_of, metadata$all_others)
  return(epi_data)
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
#' predict on smoothed data and the standard deviation
#' @description
#' This is a variant of `scaled_pop`, which predicts on a smoothed version of
#' the data. Even if the target is smoothed when used as a /predictor/, as a
#' /target/ it still uses the raw value (this captures some of the noise). It
#' also uses a rolling standard deviation as an auxiliary signal, window of
#' width `sd_width`, which by default is 28 days.
#' @param epi_data the actual data used
#' @param outcome the name of the target variable
#' @param extra_sources the name of any extra columns to use. This list could be
#'   empty
#' @param ahead (this is relative to the `as_of` field of the `epi_df`, which is
#'   likely *not* the same as the `ahead` used by epipredict, which is relative
#'   to the max time value of the `epi_df`. how to handle this is a modelling
#'   question left up to each forecaster; see latency_adjusting.R for the
#'   existing examples)
#' @param pop_scaling an example extra parameter unique to this forecaster
#' @param trainer an example extra parameter that is fairly common
#' @param smooth_width the number of days over which to do smoothing. If `NULL`,
#'   then no smoothing is applied.
#' @param smooth_cols the names of the columns to smooth. If `NULL` it smooths
#'   everything
#' @param sd_width the number of days over which to take a moving average of the
#'   standard deviation. If `NULL`, the sd_width isn't included.
#' @param sd_mean_width to calculate the sd, we need a window size for the mean
#'   used.
#' @param sd_cols the names of the columns to smooth. If `NULL` it includes
#'   the sd of everything
#' @param quantile_levels The quantile levels to predict. Defaults to those
#'   required by covidhub.
#' @seealso some utilities for making forecasters: [format_storage],
#'   [perform_sanity_checks]
#' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling
#' @importFrom tibble tibble
#' @importFrom recipes all_numeric
#' @export
smoothed_scaled <- function(epi_data,
                            outcome,
                            extra_sources = "",
                            ahead = 1,
                            pop_scaling = TRUE,
                            trainer = parsnip::linear_reg(),
                            quantile_levels = covidhub_probs(),
                            smooth_width = 7,
                            smooth_cols = NULL,
                            sd_width = 28,
                            sd_mean_width = 14,
                            sd_cols = NULL,
                            ...) {
  # perform any preprocessing not supported by epipredict
  # this is a temp fix until a real fix gets put into epipredict
  epi_data <- clear_lastminute_nas(epi_data)
  # one that every forecaster will need to handle: how to manage max(time_value)
  # that's older than the `as_of` date
  epidataAhead <- extend_ahead(epi_data, ahead)
  # see latency_adjusting for other examples
  # this next part is basically unavoidable boilerplate you'll want to copy
  epi_data <- epidataAhead[[1]]
  effective_ahead <- epidataAhead[[2]]
  args_input <- list(...)
  # edge case where there is no data or less data than the lags; eventually epipredict will handle this
  if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) {
    null_result <- tibble(
      geo_value = character(),
      forecast_date = lubridate::Date(),
      target_end_date = lubridate::Date(),
      quantile = numeric(),
      value = numeric()
    )
    return(null_result)
  }
  args_input[["ahead"]] <- effective_ahead
  args_input[["quantile_levels"]] <- quantile_levels
  args_list <- do.call(arx_args_list, args_input)
  # if you want to ignore extra_sources, setting predictors is the way to do it
  predictors <- c(outcome, extra_sources)
  # TODO: Partial match quantile_level coming from here (on Dmitry's machine)
  argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
  args_list <- argsPredictorsTrainer[[1]]
  predictors <- argsPredictorsTrainer[[2]]
  trainer <- argsPredictorsTrainer[[3]]
  # end of the copypasta
  # finally, any other pre-processing (e.g. smoothing) that isn't performed by
  # epipredict
  # smoothing: when the smoothing window matches the sd's internal mean window,
  # rolling_sd's mean column doubles as the smoothed signal, so we skip
  # rolling_mean and keep that column instead. Guard the NULL cases explicitly:
  # `NULL == x` yields logical(0), which would crash `if (!keep_mean)` inside
  # rolling_sd ("argument is of length zero")
  keep_mean <- !is.null(smooth_width) && !is.null(sd_mean_width) &&
    smooth_width == sd_mean_width # do we need to do the mean separately?
  if (!is.null(smooth_width) && !keep_mean) {
    epi_data %<>% rolling_mean(
      width = smooth_width,
      cols_to_mean = smooth_cols
    )
  }

  # measuring standard deviation
  if (!is.null(sd_width)) {
    epi_data %<>% rolling_sd(
      sd_width = sd_width,
      mean_width = sd_mean_width,
      cols_to_sd = sd_cols,
      keep_mean = keep_mean
    )
  }
  # and need to make sure we exclude the original variables as predictors
  predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors)
  # preprocessing supported by epipredict
  preproc <- epi_recipe(epi_data)
  if (pop_scaling) {
    preproc %<>% step_population_scaling(
      all_numeric(),
      df = epipredict::state_census,
      df_pop_col = "pop",
      create_new = FALSE,
      rate_rescaling = 1e5,
      by = c("geo_value" = "abbr")
    )
  }
  preproc %<>% arx_preprocess(outcome, predictors, args_list)

  # postprocessing supported by epipredict
  postproc <- frosting()
  postproc %<>% arx_postprocess(trainer, args_list)
  if (pop_scaling) {
    postproc %<>% layer_population_scaling(
      .pred, .pred_distn,
      df = epipredict::state_census,
      df_pop_col = "pop",
      create_new = FALSE,
      rate_rescaling = 1e5,
      by = c("geo_value" = "abbr")
    )
  }
  # with all the setup done, we execute and format
  pred <- run_workflow_and_format(preproc, postproc, trainer, epi_data)
  # now pred has the columns
  # (geo_value, forecast_date, target_end_date, quantile, value)
  # finally, any postprocessing not supported by epipredict e.g. calibration
  return(pred)
}
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading. Please reload this page.