-
Notifications
You must be signed in to change notification settings - Fork 0
Add the 7dav we talked about along with the std #76
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
e4b3d45
4a78810
1edc6f6
e63be89
ea335c9
8df5f05
36608f6
0eb61e3
c50249b
0b16c81
514219e
b976103
c4c7430
d6d4cdd
d542be2
16dd527
4b34428
af3ff95
ebb9db3
15d35d7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ Imports: | |
purrr, | ||
recipes (>= 1.0.4), | ||
rlang, | ||
slider, | ||
targets, | ||
tibble, | ||
tidyr | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
# various reusable transforms to apply before handing to epipredict | ||
|
||
#' extract the non-key, non-smoothed columns from epi_data
#'
#' @description
#' When `cols` is supplied it is passed through untouched; when it is `NULL`,
#' every non-key column is used, minus any column whose name matches the
#' derived rolling-mean/sd naming schema (e.g. `foo_m7`, `foo_SD28`).
#' @keywords internal
#' @param epi_data the epi_data tibble
#' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
get_trainable_names <- function(epi_data, cols) {
  if (!is.null(cols)) {
    return(cols)
  }
  candidates <- get_nonkey_names(epi_data)
  # drop anything named like the rolling average/sd columns created by
  # rolling_mean()/rolling_sd() below
  candidates[!grepl("_\\w{1,2}\\d+", candidates)]
}
|
||
#' just the names which aren't keys for an epi_df
#' @description
#' names, but it excludes keys: `geo_value`, `time_value`, and any
#' `other_keys` recorded in the epi_df's metadata attribute
#' @param epi_data the epi_df
#' @return a character vector of the non-key column names, returned visibly
#' @keywords internal
get_nonkey_names <- function(epi_data) {
  cols <- names(epi_data)
  key_cols <- c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys)
  # end on a value expression rather than an assignment so the result is
  # returned visibly
  cols[!(cols %in% key_cols)]
}
|
||
|
||
#' update the predictors to only contain the smoothed/sd versions of cols
#' @description
#' should only be applied after both rolling_mean and rolling_sd
#' @param epi_data the epi_df
#' @param cols_modified the list of columns that have been smoothed/sd'd
#' @param predictors the initial set of predictors; any unmodified are kept,
#'   any modified are replaced with their derived columns
#' @return a character vector of predictor column names
#' @importFrom purrr map map_chr reduce
#' @export
update_predictors <- function(epi_data, cols_modified, predictors) {
  if (!is.null(cols_modified)) {
    # if cols_modified isn't null, make sure we include predictors that weren't
    # modified. fixed = TRUE: column names are literal strings, not regular
    # expressions, so metacharacters in a name can't cause spurious matches
    other_predictors <- map(cols_modified, ~ !grepl(.x, predictors, fixed = TRUE)) %>% reduce(`&`)
    other_predictors <- predictors[other_predictors]
  } else {
    other_predictors <- character(0L)
  }
  # all the non-key names
  col_names <- get_nonkey_names(epi_data)
  # columns derived from `x` (their name contains `x`) that aren't themselves
  # one of the original predictors
  is_present <- function(x) {
    grepl(x, col_names, fixed = TRUE) & !(col_names %in% predictors)
  }
  is_modified <- map(predictors, is_present) %>% reduce(`|`)
  new_predictors <- col_names[is_modified]
  return(c(other_predictors, new_predictors))
}
|
||
#' get a rolling average for the named columns
#' @description
#' add column(s) that are the rolling means of the specified columns, as
#' implemented by slider. Defaults to the previous 7 days.
#' Currently only group_by's on the geo_value. Should probably extend to more
#' keys if you have them
#' @param epi_data the dataset
#' @param width the number of days (or examples, the sliding isn't time-aware) to use
#' @param cols_to_mean the non-key columns to take the mean over. `NULL` means all
#' @importFrom slider slide_dbl
#' @export
rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
  cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
  epi_data <- group_by(epi_data, geo_value)
  for (target_col in cols_to_mean) {
    # derived columns follow the `<name>_m<width>` schema, e.g. `cases_m7`
    mean_col <- paste0(target_col, "_m", width)
    epi_data <- mutate(
      epi_data,
      {{ mean_col }} := slider::slide_dbl(.data[[target_col]], mean, .before = width)
    )
  }
  epi_data <- ungroup(epi_data)
  return(epi_data)
}
|
||
#' store the metadata in an easy to reapply way
#' @description
#' pull `as_of`, `geo_type`, and `time_type` out of the epi_df's metadata
#' attribute, bundling any remaining entries under `all_others`, so the whole
#' set can be reapplied after operations that strip epi_df attributes
#' @param epi_data the epi_df
#' @importFrom purrr list_modify
cache_metadata <- function(epi_data) {
  metadata <- attributes(epi_data)$metadata
  # everything that isn't one of the three standard fields
  all_others <- metadata
  for (standard_field in c("geo_type", "time_type", "as_of")) {
    all_others[standard_field] <- NULL
  }
  # removing every element leaves a list with character(0) names; normalize
  # that (and a NULL metadata attribute) to a plain empty list
  if (length(all_others) == 0) {
    all_others <- list()
  }
  features <- list(
    as_of = metadata$as_of,
    geo_type = metadata$geo_type,
    time_type = metadata$time_type,
    all_others = all_others
  )
  return(features)
}
|
||
#' get a rolling standard deviation for the named columns
#' @description
#' A rolling standard deviation, based off of a rolling mean. First it
#' calculates a rolling mean with width `mean_width`, and then squares the
#' difference between that and the actual value, averaged over `sd_width`.
#' @param epi_data the dataset
#' @param sd_width the number of days (or examples, the sliding isn't
#'   time-aware) to use for the standard deviation calculation
#' @param mean_width like `sd_width`, but it governs the mean. Should be less
#'   than the `sd_width`, and if `NULL` (the default) it is half of `sd_width`
#'   (so 14 in the complete default case)
#' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
#' @param keep_mean bool, if `TRUE`, it keeps the intermediate mean column
#' @importFrom slider slide_dbl slide2_dbl
#' @export
rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
  if (is.null(mean_width)) {
    mean_width <- as.integer(ceiling(sd_width / 2))
  }
  cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
  # mutate()/select() strip epi_df attributes, so cache them for restoration
  metadata <- cache_metadata(epi_data)
  epi_data %<>% group_by(geo_value)
  for (col in cols_to_sd) {
    mean_name <- paste0(col, "_m", mean_width)
    sd_name <- paste0(col, "_SD", sd_width)
    # NOTE(review): `.before = w` windows include the current observation plus
    # the `w` prior ones (w + 1 points) -- confirm that is the intent
    epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width))
    epi_data %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width))
    if (!keep_mean) {
      epi_data %<>% select(-{{ mean_name }})
    }
  }
  epi_data %<>% ungroup()
  # restore the epi_df class/metadata exactly once, after every column
  # operation and the ungroup(); previously this ran inside the loop, redoing
  # the coercion per column and leaving the trailing ungroup() free to strip
  # the just-restored attributes
  epi_data %<>% as_epi_df(metadata$geo_type, metadata$time_type, metadata$as_of, metadata$all_others)
  return(epi_data)
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
#' predict on smoothed data and the standard deviation
#' @description
#' This is a variant of `scaled_pop`, which predicts on a smoothed version of
#' the data. Even if the target is smoothed when used as a /predictor/, as a
#' /target/ it still uses the raw value (this captures some of the noise). It
#' also uses a rolling standard deviation as an auxiliary signal, window of
#' width `sd_width`, which by default is 28 days.
#' @param epi_data the actual data used
#' @param outcome the name of the target variable
#' @param extra_sources the name of any extra columns to use. This list could be
#'   empty
#' @param ahead (this is relative to the `as_of` field of the `epi_df`, which is
#'   likely *not* the same as the `ahead` used by epipredict, which is relative
#'   to the max time value of the `epi_df`. how to handle this is a modelling
#'   question left up to each forecaster; see latency_adjusting.R for the
#'   existing examples)
#' @param pop_scaling an example extra parameter unique to this forecaster
#' @param trainer an example extra parameter that is fairly common
#' @param smooth_width the number of days over which to do smoothing. If `NULL`,
#'   then no smoothing is applied.
#' @param smooth_cols the names of the columns to smooth. If `NULL` it smooths
#'   everything
#' @param sd_width the number of days over which to take a moving average of the
#'   standard deviation. If `NULL`, the sd_width isn't included.
#' @param sd_mean_width to calculate the sd, we need a window size for the mean
#'   used.
#' @param sd_cols the names of the columns to smooth. If `NULL` it includes
#'   the sd of everything
#' @param quantile_levels The quantile levels to predict. Defaults to those
#'   required by covidhub.
#' @seealso some utilities for making forecasters: [format_storage],
#'   [perform_sanity_checks]
#' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling
#' @importFrom tibble tibble
#' @importFrom recipes all_numeric
#' @export
smoothed_scaled <- function(epi_data,
                            outcome,
                            extra_sources = "",
                            ahead = 1,
                            pop_scaling = TRUE,
                            trainer = parsnip::linear_reg(),
                            quantile_levels = covidhub_probs(),
                            smooth_width = 7,
                            smooth_cols = NULL,
                            sd_width = 28,
                            sd_mean_width = 14,
                            sd_cols = NULL,
                            ...) {
  # perform any preprocessing not supported by epipredict
  # this is a temp fix until a real fix gets put into epipredict
  epi_data <- clear_lastminute_nas(epi_data)
  # one that every forecaster will need to handle: how to manage max(time_value)
  # that's older than the `as_of` date
  epidataAhead <- extend_ahead(epi_data, ahead)
  # see latency_adjusting for other examples
  # this next part is basically unavoidable boilerplate you'll want to copy
  epi_data <- epidataAhead[[1]]
  effective_ahead <- epidataAhead[[2]]
  args_input <- list(...)
  # edge case where there is no data or less data than the lags; eventually epipredict will handle this
  if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) {
    null_result <- tibble(
      geo_value = character(),
      forecast_date = lubridate::Date(),
      target_end_date = lubridate::Date(),
      quantile = numeric(),
      value = numeric()
    )
    return(null_result)
  }
  args_input[["ahead"]] <- effective_ahead
  args_input[["quantile_levels"]] <- quantile_levels
  args_list <- do.call(arx_args_list, args_input)
  # if you want to ignore extra_sources, setting predictors is the way to do it
  predictors <- c(outcome, extra_sources)
  # TODO: Partial match quantile_level coming from here (on Dmitry's machine)
  argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
  args_list <- argsPredictorsTrainer[[1]]
  predictors <- argsPredictorsTrainer[[2]]
  trainer <- argsPredictorsTrainer[[3]]
  # end of the copypasta
  # finally, any other pre-processing (e.g. smoothing) that isn't performed by
  # epipredict
  # smoothing: when the smoothing window matches the sd's internal mean window,
  # rolling_sd's mean column doubles as the smoothed signal, so we skip
  # rolling_mean and keep that column instead. Guard the NULL cases explicitly:
  # `NULL == x` yields logical(0), which would crash `if (!keep_mean)` inside
  # rolling_sd ("argument is of length zero")
  keep_mean <- !is.null(smooth_width) && !is.null(sd_mean_width) &&
    smooth_width == sd_mean_width # do we need to do the mean separately?
  if (!is.null(smooth_width) && !keep_mean) {
    epi_data %<>% rolling_mean(
      width = smooth_width,
      cols_to_mean = smooth_cols
    )
  }

  # measuring standard deviation
  if (!is.null(sd_width)) {
    epi_data %<>% rolling_sd(
      sd_width = sd_width,
      mean_width = sd_mean_width,
      cols_to_sd = sd_cols,
      keep_mean = keep_mean
    )
  }
  # and need to make sure we exclude the original variables as predictors
  predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors)
  # preprocessing supported by epipredict
  preproc <- epi_recipe(epi_data)
  if (pop_scaling) {
    preproc %<>% step_population_scaling(
      all_numeric(),
      df = epipredict::state_census,
      df_pop_col = "pop",
      create_new = FALSE,
      rate_rescaling = 1e5,
      by = c("geo_value" = "abbr")
    )
  }
  preproc %<>% arx_preprocess(outcome, predictors, args_list)

  # postprocessing supported by epipredict
  postproc <- frosting()
  postproc %<>% arx_postprocess(trainer, args_list)
  if (pop_scaling) {
    postproc %<>% layer_population_scaling(
      .pred, .pred_distn,
      df = epipredict::state_census,
      df_pop_col = "pop",
      create_new = FALSE,
      rate_rescaling = 1e5,
      by = c("geo_value" = "abbr")
    )
  }
  # with all the setup done, we execute and format
  pred <- run_workflow_and_format(preproc, postproc, trainer, epi_data)
  # now pred has the columns
  # (geo_value, forecast_date, target_end_date, quantile, value)
  # finally, any postprocessing not supported by epipredict e.g. calibration
  return(pred)
}
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading. Please reload this page.