Open
Description
See how the predictor selection varies when we use "analogous" workflows with vs. without population scaling (based on the example for step_population_scaling
):
library(epiprocess)
#>
#> Attaching package: 'epiprocess'
#> The following object is masked from 'package:stats':
#>
#> filter
library(epipredict)
#> Loading required package: parsnip
jhu <- epiprocess::jhu_csse_daily_subset %>%
dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
dplyr::select(geo_value, time_value, cases)
pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000))
r <- epi_recipe(jhu) %>%
step_population_scaling(df = pop_data,
df_pop_col = "value",
by = c("geo_value" = "states"),
cases, suffix = "_scaled") %>%
step_epi_lag(cases_scaled, lag = c(7, 14)) %>%
step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>%
step_epi_naomit()
f <- frosting() %>%
layer_predict() %>%
layer_threshold(.pred) %>%
layer_naomit(.pred) %>%
layer_population_scaling(.pred, df = pop_data,
by = c("geo_value" = "states"),
df_pop_col = "value")
wf <- epi_workflow(r, parsnip::linear_reg()) %>%
fit(jhu) %>%
add_frosting(f)
names(coef(extract_fit_engine(wf)))
#> [1] "(Intercept)" "cases_scaled" "lag_7_cases_scaled"
#> [4] "lag_14_cases_scaled"
r2 <- epi_recipe(jhu) %>%
step_epi_lag(cases, lag = c(7, 14)) %>%
step_epi_ahead(cases, ahead = 7, role = "outcome") %>%
step_epi_naomit()
f2 <- frosting() %>%
layer_predict() %>%
layer_threshold(.pred) %>%
layer_naomit(.pred)
wf2 <- epi_workflow(r2, parsnip::linear_reg()) %>%
fit(jhu) %>%
add_frosting(f2)
names(coef(extract_fit_engine(wf2)))
#> [1] "(Intercept)" "lag_7_cases" "lag_14_cases"
Created on 2023-04-07 by the reprex package (v2.0.1)
In this case, the workflow with scaling has a lag-0 predictor, while the one without does not. If, instead, we were to use lag = c(0, 7, 14)
, then both versions would have a lag-0 predictor, but the one with scaling would have two copies of it, and might cause issues for some engines.