#' inverting the existing scaling.
#' @param by A (possibly named) character vector of variables to join by.
#'
- #' If `NULL`, the default, the function will perform a natural join, using all
- #' variables in common across the `epi_df` produced by the `predict()` call
- #' and the user-provided dataset.
- #' If columns in that `epi_df` and `df` have the same name (and aren't
- #' included in `by`), `.df` is added to the one from the user-provided data
- #' to disambiguate.
+ #' If `NULL`, the default, the function will try to infer a reasonable set of
+ #' columns. First, it will try to join by all variables in the training/test
+ #' data with roles `"geo_value"`, `"key"`, or `"time_value"` that also appear in
+ #' `df`; these roles are automatically set if you are using an `epi_df`, or you
+ #' can use, e.g., `update_role`. If no such roles are set, it will try to
+ #' perform a natural join, using variables in common between the training/test
+ #' data and population data.
+ #'
+ #' If columns in the training/testing data and `df` have the same name (and
+ #' aren't included in `by`), a `.df` suffix is added to the one from the
+ #' user-provided data to disambiguate.
#'
#' To join by different variables on the `epi_df` and `df`, use a named vector.
#' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value`
#' to `df$states`. To join by multiple variables, use a vector with length > 1.
#' For example, `by = c("geo_value" = "states", "county" = "county")` will match
#' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`.
#'
- #' See [dplyr::left_join()] for more details.
+ #' See [dplyr::inner_join()] for more details.
#' @param df_pop_col the name of the column in the data frame `df` that
#' contains the population data and will be used for scaling.
#' This should be one column.
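A hedged usage sketch (not part of this diff; `jhu`, `cases`, `state_pops`, `states`, and `pop` are hypothetical example names) showing the named-vector form of `by` together with `rate_rescaling`:

library(epipredict)
library(dplyr)

# Sketch only: `jhu` is assumed to be an `epi_df` with a `cases` column;
# `state_pops` is a data frame with state names in `states` and populations in `pop`.
rec <- epi_recipe(jhu) %>%
  step_population_scaling(
    cases,
    df = state_pops,
    df_pop_col = "pop",
    by = c("geo_value" = "states"), # match the epi_df's geo_value to state_pops$states
    rate_rescaling = 1e5 # report rates per 100,000 population
  )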
@@ -89,13 +94,25 @@ step_population_scaling <-
           suffix = "_scaled",
           skip = FALSE,
           id = rand_id("population_scaling")) {
- arg_is_scalar(role, df_pop_col, rate_rescaling, create_new, suffix, id)
- arg_is_lgl(create_new, skip)
- arg_is_chr(df_pop_col, suffix, id)
+ if (rlang::dots_n(...) == 0L) {
+   cli_abort(c(
+     "`...` must not be empty.",
+     ">" = "Please provide one or more tidyselect expressions in `...`
+            specifying the columns to which scaling should be applied.",
+     ">" = "If you really want to list `step_population_scaling` in your
+            recipe but not have it do anything, you can use a tidyselection
+            that selects zero variables, such as `c()`."
+   ))
+ }
+ arg_is_scalar(role, df_pop_col, rate_rescaling, create_new, suffix, skip, id)
+ arg_is_chr(role, df_pop_col, suffix, id)
+ hardhat::validate_column_names(df, df_pop_col)
arg_is_chr(by, allow_null = TRUE)
+ arg_is_numeric(rate_rescaling)
if (rate_rescaling <= 0) {
  cli_abort("`rate_rescaling` must be a positive number.")
}
+ arg_is_lgl(create_new, skip)

recipes::add_step(
  recipe,
@@ -138,6 +155,42 @@ step_population_scaling_new <-

#' @export
prep.step_population_scaling <- function(x, training, info = NULL, ...) {
+ if (is.null(x$by)) {
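+   # Candidate join keys on the population (`df`) side: every column of `df`
+   # except the population column itself.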
+   rhs_potential_keys <- setdiff(colnames(x$df), x$df_pop_col)
+   lhs_potential_keys <- info %>%
+     filter(role %in% c("geo_value", "key", "time_value")) %>%
+     extract2("variable") %>%
+     unique() # in case of weird var with multiple of above roles
+   if (length(lhs_potential_keys) == 0L) {
+     # We're working with a recipe and tibble, and *_role hasn't set up any of
+     # the above roles. Let's say any column could actually act as a key, and
+     # lean on `intersect` below to make this something reasonable.
+     lhs_potential_keys <- names(training)
+   }
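+   # Epi key columns (geo_value plus any other keys, but not time_value) that
+   # the population data would ideally be broken down by; only used below to
+   # warn when some of them are missing from `df`.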
+   suggested_min_keys <- info %>%
+     filter(role %in% c("geo_value", "key")) %>%
+     extract2("variable") %>%
+     unique()
+   # (0 suggested keys if we weren't given any epikeytime var info.)
+   x$by <- intersect(lhs_potential_keys, rhs_potential_keys)
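+   # e.g., training data keyed by geo_value and time_value joined against a
+   # `df` whose only non-population column is "geo_value" ends up with
+   # by = "geo_value" here.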
+   if (length(x$by) == 0L) {
+     cli_abort(c(
+       "Couldn't guess a default for `by`.",
+       ">" = "Please rename columns in your population data to match those in your training data,
+              or manually specify `by =` in `step_population_scaling()`."
+     ), class = "epipredict__step_population_scaling__default_by_no_intersection")
+   }
+   if (!all(suggested_min_keys %in% x$by)) {
+     cli_warn(c(
+       "{setdiff(suggested_min_keys, x$by)} {?was an/were} epikey column{?s} in the training data,
+        but {?wasn't/weren't} found in the population `df`.",
+       "i" = "Defaulting to join by {x$by}.",
+       ">" = "Double-check whether column names on the population `df` match those for your training data.",
+       ">" = "Consider using population data with breakdowns by {suggested_min_keys}.",
+       ">" = "Manually specify `by =` to silence."
+     ), class = "epipredict__step_population_scaling__default_by_missing_suggested_keys")
+   }
+ }
step_population_scaling_new(
  terms = x$terms,
  role = x$role,
@@ -156,10 +209,14 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) {

#' @export
bake.step_population_scaling <- function(object, new_data, ...) {
- object$by <- object$by %||% intersect(
-   epi_keys_only(new_data),
-   colnames(select(object$df, !object$df_pop_col))
- )
+ if (is.null(object$by)) {
+   cli::cli_abort(c(
+     "`by` was not set and no default was filled in",
+     ">" = "If this was a fit recipe generated from an older version
+            of epipredict that you loaded in from a file,
+            please regenerate with the current version of epipredict."
+   ))
+ }
joinby <- list(x = names(object$by) %||% object$by, y = object$by)
hardhat::validate_column_names(new_data, joinby$x)
hardhat::validate_column_names(object$df, joinby$y)
@@ -177,7 +234,10 @@ bake.step_population_scaling <- function(object, new_data, ...) {
suffix <- ifelse(object$create_new, object$suffix, "")
col_to_remove <- setdiff(colnames(object$df), colnames(new_data))

- left_join(new_data, object$df, by = object$by, suffix = c("", ".df")) %>%
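+ # The stricter join arguments below require each new_data row to match
+ # exactly one population row (an unmatched new_data row is an error), while
+ # unused rows of the population `df` are silently dropped.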
+ inner_join(new_data, object$df,
+   by = object$by, relationship = "many-to-one", unmatched = c("error", "drop"),
+   suffix = c("", ".df")
+ ) %>%
  mutate(
    across(
      all_of(object$columns),