diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 5cf55c1b..6dbadacc 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -19,6 +19,9 @@ jobs:
       fail-fast: false
       matrix:
         config:
+          # The one we use in production.
+          - { os: ubuntu-latest, r: "renv" }
+          # See if the latest release works.
           - { os: ubuntu-latest, r: "release" }
 
     env:
diff --git a/R/new_epipredict_steps/layer_yeo_johnson.R b/R/new_epipredict_steps/layer_yeo_johnson.R
new file mode 100644
index 00000000..b3ae0bd7
--- /dev/null
+++ b/R/new_epipredict_steps/layer_yeo_johnson.R
@@ -0,0 +1,245 @@
+#' Reverse the Yeo-Johnson transformation
+#'
+#' Will undo a `step_epi_YeoJohnson()` transformation.
+#'
+#' @param frosting a `frosting` postprocessor. The layer will be added to the
+#'   sequence of operations for this frosting.
+#' @param ... One or more selector functions to choose which prediction
+#'   columns to un-transform. See [recipes::selections()] for more details.
+#' @param lambdas Optional. A tibble of Yeo-Johnson transformation parameters,
+#'   keyed by the epi key columns, with one `.lambda_` column per transformed
+#'   variable. If `NULL` (the default), the lambdas are taken from the
+#'   `step_epi_YeoJohnson()` in the workflow's recipe.
+#' @param by A (possibly named) character vector of variables to join by.
+#' @param id a random id string
+#'
+#' @return an updated `frosting` postprocessor
+#' @export
+#' @examples
+#' library(dplyr)
+#' jhu <- epidatasets::cases_deaths_subset %>%
+#'   filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
+#'   select(geo_value, time_value, cases)
+#'
+#' # Create a recipe with a Yeo-Johnson transformation.
+#' r <- epi_recipe(jhu) %>%
+#'   step_epi_YeoJohnson(cases) %>%
+#'   step_epi_lag(cases, lag = 0) %>%
+#'   step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
+#'   step_epi_naomit()
+#'
+#' # Create a frosting layer that will undo the Yeo-Johnson transformation.
+#' f <- frosting() %>%
+#'   layer_predict() %>%
+#'   layer_epi_YeoJohnson(.pred)
+#'
+#' # Create a workflow and fit it.
+#' wf <- epi_workflow(r, linear_reg()) %>%
+#'   fit(jhu) %>%
+#'   add_frosting(f)
+#'
+#' # Forecast the workflow, which should reverse the Yeo-Johnson transformation.
+#' forecast(wf)
+#' # Compare to the original data.
+#' jhu %>% filter(time_value == "2021-12-31")
+#' forecast(wf)
+layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) {
+  checkmate::assert_tibble(lambdas, min.rows = 1, null.ok = TRUE)
+
+  add_layer(
+    frosting,
+    layer_epi_YeoJohnson_new(
+      lambdas = lambdas,
+      by = by,
+      terms = dplyr::enquos(...),
+      id = id
+    )
+  )
+}
+
+layer_epi_YeoJohnson_new <- function(lambdas, by, terms, id) {
+  epipredict:::layer("epi_YeoJohnson", lambdas = lambdas, by = by, terms = terms, id = id)
+}
+
+#' @export
+#' @importFrom workflows extract_preprocessor
+slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) {
+  rlang::check_dots_empty()
+
+  # Get the lambdas from the layer or from the workflow.
+  lambdas <- object$lambdas %||% get_lambdas_in_layer(workflow)
+
+  # If the by is not specified, try to infer it from the lambdas.
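+  # `lambdas` is keyed by the epi key columns (`geo_value` plus any
+  # other_keys), so the join columns should be the intersection of the
+  # prediction keys and the non-".lambda_" columns of that table.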
+  if (is.null(object$by)) {
+    # Assume `layer_predict` has calculated the prediction keys and other
+    # layers don't change the prediction key colnames:
+    prediction_key_colnames <- names(components$keys)
+    lhs_potential_keys <- prediction_key_colnames
+    rhs_potential_keys <- colnames(select(lambdas, -starts_with(".lambda_")))
+    object$by <- intersect(lhs_potential_keys, rhs_potential_keys)
+    suggested_min_keys <- setdiff(lhs_potential_keys, "time_value")
+    if (!all(suggested_min_keys %in% object$by)) {
+      cli_warn(
+        c(
+          "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions,
+          but {?wasn't/weren't} found in the `lambdas` table.",
+          "i" = "Defaulting to join by {object$by}",
+          ">" = "Double-check whether column names in the `lambdas` table match those expected in your predictions",
+          ">" = "Consider estimating lambdas with breakdowns by {suggested_min_keys}",
+          ">" = "Manually specify `by =` to silence"
+        ),
+        class = "epipredict__layer_epi_YeoJohnson__default_by_missing_suggested_keys"
+      )
+    }
+  }
+
+  # Establish the join columns.
+  object$by <- object$by %||%
+    intersect(
+      epipredict:::epi_keys_only(components$predictions),
+      colnames(select(lambdas, -starts_with(".lambda_")))
+    )
+  joinby <- list(x = names(object$by) %||% object$by, y = object$by)
+  hardhat::validate_column_names(components$predictions, joinby$x)
+  hardhat::validate_column_names(lambdas, joinby$y)
+
+  # Join the lambdas.
+  components$predictions <- inner_join(
+    components$predictions,
+    lambdas,
+    by = object$by,
+    relationship = "many-to-one",
+    unmatched = c("error", "drop")
+  )
+
+  exprs <- rlang::expr(c(!!!object$terms))
+  pos <- tidyselect::eval_select(exprs, components$predictions)
+  col_names <- names(pos)
+
+  # The `object$terms` is where the user specifies the columns they want to
+  # un-transform. We need to match the outcomes with their lambda columns in
+  # our parameter table and then apply the inverse transformation.
+  if (identical(col_names, ".pred")) {
+    # In this case, we don't get a hint for the outcome column name, so we need
+    # to infer it from the mold.
+    if (length(components$mold$outcomes) > 1) {
+      cli_abort("Only one outcome is allowed when specifying `.pred`.", call = rlang::caller_env())
+    }
+    # `outcomes` is a vector of objects like ahead_1_cases, ahead_7_cases, etc.
+    # We want to extract the cases part.
+    outcome_cols <- names(components$mold$outcomes) %>%
+      stringr::str_match("ahead_\\d+_(.*)") %>%
+      magrittr::extract(, 2)
+
+    components$predictions <- components$predictions %>%
+      rowwise() %>%
+      mutate(.pred := yj_inverse(.pred, !!sym(paste0(".lambda_", outcome_cols))))
+  } else if (identical(col_names, character(0))) {
+    # Wish I could suggest `all_outcomes()` here, but currently it's the same as
+    # not specifying any terms. I don't want to spend time dealing with this
+    # case until someone asks for it.
+    cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented.
+      If you had a single outcome, you can use `.pred` as a column name.
+      If you had multiple outcomes, you'll need to specify them like
+      `.pred_ahead_1_`, `.pred_ahead_7_`, etc.
+      ", call = rlang::caller_env())
+  } else {
+    # In this case, we assume that the user has specified the columns they want
+    # transformed here. We then need to determine the lambda columns for each of
+    # these columns. That is, we need to convert a vector of column names like
+    # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to
+    # c(".lambda_ahead_1_case_rate", ".lambda_ahead_7_case_rate").
+    original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2]
+    outcomes_wout_ahead <- str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[, 2]
+    if (any(original_outcome_cols %nin% outcomes_wout_ahead)) {
+      cli_abort("All columns specified in `...` must be outcome columns.
+        They must be of the form `.pred_ahead_1_`, `.pred_ahead_7_`, etc.
+        ", call = rlang::caller_env())
+    }
+
+    for (i in seq_along(col_names)) {
+      col <- col_names[i]
+      lambda_col <- paste0(".lambda_", original_outcome_cols[i])
+      components$predictions <- components$predictions %>%
+        rowwise() %>%
+        mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(lambda_col)))
+    }
+  }
+
+  # Remove the lambda columns.
+  components$predictions <- components$predictions %>%
+    select(-starts_with(".lambda_")) %>%
+    ungroup()
+  components
+}
+
+#' @export
+print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), ...) {
+  title <- "Yeo-Johnson transformation (see `lambdas` object for values) on "
+  epipredict:::print_layer(x$terms, title = title, width = width)
+}
+
+#' Inverse Yeo-Johnson transformation
+#'
+#' Inverse of `yj_transform` in step_yeo_johnson.R. Note that this function is
+#' vectorized in x, but not in lambda.
+#'
+#' @keywords internal
yj_inverse <- function(x, lambda, eps = 0.001) {
+  if (is.na(lambda)) {
+    return(x)
+  }
+  if (!inherits(x, "tbl_df") || is.data.frame(x)) {
+    x <- unlist(x, use.names = FALSE)
+  } else {
+    if (!is.vector(x)) {
+      x <- as.vector(x)
+    }
+  }
+
+  dat_neg <- x < 0
+  ind_neg <- list(is = which(dat_neg), not = which(!dat_neg))
+  not_neg <- ind_neg[["not"]]
+  is_neg <- ind_neg[["is"]]
+
+  nn_inv_trans <- function(x, lambda) {
+    if (abs(lambda) < eps) {
+      # inverse of log(x + 1)
+      exp(x) - 1
+    } else {
+      # inverse of ((x + 1)^lambda - 1) / lambda
+      (lambda * x + 1)^(1 / lambda) - 1
+    }
+  }
+
+  ng_inv_trans <- function(x, lambda) {
+    if (abs(lambda - 2) < eps) {
+      # inverse of -log(-x + 1)
+      -(exp(-x) - 1)
+    } else {
+      # inverse of -((-x + 1)^(2 - lambda) - 1) / (2 - lambda)
+      -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1)
+    }
+  }
+
+  if (length(not_neg) > 0) {
+    x[not_neg] <- nn_inv_trans(x[not_neg], lambda)
+  }
+
+  if (length(is_neg) > 0) {
+    x[is_neg] <- ng_inv_trans(x[is_neg], lambda)
+  }
+  x
+}
+
+get_lambdas_in_layer <- function(workflow) {
+  this_recipe <- hardhat::extract_recipe(workflow)
+  if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) {
+    cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env())
+  }
+  for (step in this_recipe$steps) {
+    if (inherits(step, "step_epi_YeoJohnson")) {
+      lambdas <- step$lambdas
+      break
+    }
+  }
+  lambdas
+}
diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R
new file mode 100644
index 00000000..ef83e172
--- /dev/null
+++ b/R/new_epipredict_steps/step_yeo_johnson.R
@@ -0,0 +1,400 @@
+#' Yeo-Johnson transformation
+#'
+#' `step_epi_YeoJohnson()` creates a *specification* of a recipe step that will
+#' transform data using a Yeo-Johnson transformation. This fork works with panel
+#' data and is meant for epidata.
+#'
+#' @inheritParams step_center
+#' @param lambdas Internal. A tibble of transformation values, with one
+#'   `.lambda_` column per transformed variable, keyed by the epi key columns.
+#'   This is `NULL` until computed by [prep()].
+#' @param na_lambda_fill A numeric value to fill in for any
+#'   groups where the lambda cannot be estimated.
+#' @param limits A length 2 numeric vector defining the range to
+#'   compute the transformation parameter lambda.
+#' @param num_unique An integer. Groups whose data have fewer than this
+#'   many unique values will not have a lambda estimated; `na_lambda_fill`
+#'   is used for them instead.
+#' @param na_rm A logical indicating whether missing values should be
+#'   removed.
+#' @param skip A logical. Should the step be skipped when the recipe is
+#'   baked by [bake()]? On the `training` data, the step will always be
+#'   conducted (even if `skip = TRUE`).
+#' @template step-return
+#' @family individual transformation steps
+#' @export
+#' @details The Yeo-Johnson transformation is a variance-stabilizing
+#'   transformation, similar to the Box-Cox transformation, but it does not
+#'   require the input variables to be strictly positive. In the package, the
+#'   partial log-likelihood function is directly optimized within a reasonable
+#'   set of transformation values (which can be changed by the user). The
+#'   optimization finds, for each group in the data, the lambda parameter that
+#'   maximizes the partial log-likelihood of the transformed data.
+#'
+#' This transformation is typically done on the outcome variable
+#' using the residuals for a statistical model (such as ordinary
+#' least squares). Here, a simple null model (intercept only) is
+#' used to apply the transformation to the *predictor*
+#' variables individually. This can have the effect of making the
+#' variable distributions more symmetric.
+#'
+#' If the transformation parameter for a group is estimated to be very
+#' close to the bounds, or if the optimization fails, the lambda is
+#' treated as missing and `na_lambda_fill` is used for that group instead.
+#'
+#' # Tidying
+#'
+#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with
+#' columns `terms`, `value`, and `id`:
+#'
+#' \describe{
+#'   \item{terms}{character, the selectors or variables selected}
+#'   \item{value}{numeric, the lambda estimate}
+#'   \item{id}{character, id of this step}
+#' }
+#'
+#' Because a separate lambda is estimated for each group, the epi key columns
+#' (such as `geo_value`) are included as well.
+#'
+#' @template case-weights-not-supported
+#'
+#' @references Yeo, I. K., and Johnson, R. A. (2000). A new family of power
+#'   transformations to improve normality or symmetry. *Biometrika*, 87(4),
+#'   954-959.
+#' @examples
+#' jhu <- cases_deaths_subset %>%
+#'   filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
+#'   select(geo_value, time_value, cases)
+#' filtered_data <- jhu
+#'
+#' r <- epi_recipe(filtered_data) %>%
+#'   step_epi_YeoJohnson(cases)
+#' # View the recipe
+#' r
+#' # Fit the recipe
+#' tr <- r %>% prep(filtered_data)
+#' # View the lambda values
+#' tr$steps[[1]]$lambdas
+#' # View the transformed data
+#' df <- tr %>% bake(filtered_data)
+#' plot(density(df$cases))
+#' plot(density(filtered_data$cases))
+step_epi_YeoJohnson <- function(
+  recipe,
+  ...,
+  role = "predictor",
+  trained = FALSE,
+  lambdas = NULL,
+  na_lambda_fill = 1 / 4,
+  limits = c(-5, 5),
+  num_unique = 5,
+  na_rm = TRUE,
+  skip = FALSE,
+  id = rand_id("epi_YeoJohnson")
+) {
+  checkmate::assert_numeric(limits, len = 2)
+  checkmate::assert_numeric(na_lambda_fill, lower = min(limits), upper = max(limits), len = 1)
+  checkmate::assert_numeric(num_unique, lower = 2, upper = Inf, len = 1)
+  checkmate::assert_logical(na_rm, len = 1)
+  checkmate::assert_logical(skip, len = 1)
+  add_step(
+    recipe,
+    step_epi_YeoJohnson_new(
+      terms = enquos(...),
+      role = role,
+      trained = trained,
+      lambdas = lambdas,
+      na_lambda_fill = na_lambda_fill,
+      limits = sort(limits)[1:2],
+      num_unique = num_unique,
+      na_rm = na_rm,
+      forecast_date = NULL,
+      metadata = NULL,
+      columns = NULL,
+      skip = skip,
+      id = id
+    )
+  )
+}
+
+step_epi_YeoJohnson_new <- function(
+  terms,
+  role,
+  trained,
+  lambdas,
+  na_lambda_fill,
+  limits,
+  num_unique,
+  na_rm,
+  forecast_date,
+  metadata,
+  columns,
+  skip,
+  id
+) {
+  step(
+    subclass = "epi_YeoJohnson",
+    terms = terms,
+    role = role,
+    trained = trained,
+    lambdas = lambdas,
+    na_lambda_fill = na_lambda_fill,
+    limits = limits,
+    num_unique = num_unique,
+    na_rm = na_rm,
+    forecast_date = forecast_date,
+    metadata = metadata,
+    columns = columns,
+    skip = skip,
+    id = id
+  )
+}
+
+#' @export
+prep.step_epi_YeoJohnson <- function(x, training, info = NULL, ...) {
+  # Check that the columns selected for transformation are numeric.
+  col_names <- recipes_eval_select(x$terms, training, info)
+  recipes::check_type(training[, col_names], types = c("double", "integer"))
+
+  lambdas <- get_lambdas_yj_table(
+    training,
+    col_names,
+    x$limits,
+    x$num_unique,
+    x$na_lambda_fill,
+    x$na_rm,
+    key_colnames(training, exclude = "time_value")
+  )
+
+  step_epi_YeoJohnson_new(
+    terms = x$terms,
+    role = x$role,
+    trained = TRUE,
+    lambdas = lambdas,
+    na_lambda_fill = x$na_lambda_fill,
+    limits = x$limits,
+    num_unique = x$num_unique,
+    na_rm = x$na_rm,
+    forecast_date = attributes(training)$metadata$as_of,
+    metadata = attributes(training)$metadata,
+    columns = col_names,
+    skip = x$skip,
+    id = x$id
+  )
+}
+
+#' @export
+bake.step_epi_YeoJohnson <- function(object, new_data, ...) {
+  # If not an epi_df, make it one assuming the template of training data.
+  # If it is an epi_df, check that the keys match.
+  # Imitating the pattern in step_adjust_latency().
+  if (!inherits(new_data, "epi_df") || is.null(attributes(new_data)$metadata$as_of)) {
+    new_data <- as_epi_df(
+      new_data,
+      as_of = object$forecast_date,
+      other_keys = object$metadata$other_keys %||% character()
+    )
+    new_data %@% metadata <- object$metadata
+  }
+  # Check that the keys match.
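+  # `object$lambdas` is keyed by the training data's epi keys; each of those
+  # keys must also be present in `new_data` so the per-group lambdas can be
+  # joined on below.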
+  keys <- key_colnames(new_data, exclude = "time_value")
+  old_keys <- object$lambdas %>% select(-starts_with(".lambda_")) %>% colnames()
+  if (!all(keys %in% old_keys)) {
+    cli::cli_abort(
+      "The keys of the new data do not match the keys of the training data.",
+      call = rlang::caller_env()
+    )
+  }
+  # Check that the columns for transformation are present in new_data.
+  col_names <- object$columns
+  check_new_data(col_names, object, new_data)
+
+  # Transform each column, using the appropriate lambda column per row.
+  # Note that yj_transform() is vectorized in x, but not in lambda.
+  new_data <- left_join(new_data, object$lambdas, by = keys)
+  for (col in col_names) {
+    new_data <- new_data %>%
+      rowwise() %>%
+      mutate(!!col := yj_transform(!!sym(col), !!sym(paste0(".lambda_", col))))
+  }
+  # Remove the lambda columns.
+  new_data %>%
+    select(-starts_with(".lambda_")) %>%
+    ungroup()
+}
+
+#' @export
+print.step_epi_YeoJohnson <- function(x, width = max(20, options()$width - 39), ...) {
+  title <- "Yeo-Johnson transformation (see `lambdas` object for values) on "
+  epipredict:::print_epi_step(x$terms, x$terms, title = title, width = width)
+  invisible(x)
+}
+
+#' Compute the lambda values per group for each column.
+#'
+#' @keywords internal
+#' @rdname recipes-internal
+get_lambdas_yj_table <- function(training, col_names, limits, num_unique, na_lambda_fill, na_rm, epi_keys_checked) {
+  # Estimate the lambda for each column, creating a .lambda_ column for each.
+  # Note that estimate_yj() operates on a vector.
+  lambdas <- training %>%
+    summarise(
+      across(all_of(col_names), ~ estimate_yj(.x, limits, num_unique, na_rm)),
+      .by = all_of(epi_keys_checked)
+    ) %>%
+    rename_with(~ paste0(".lambda_", .x), -all_of(epi_keys_checked))
+
+  # Check for NAs in any of the .lambda_ columns.
+  # EDIT: This warning was too noisy. Keeping code around, in case we want it.
+  # for (col in col_names) {
+  #   if (any(is.na(values[[paste0("lambda_", col)]]))) {
+  #     cli::cli_warn(
+  #       c(
+  #         x = "Yeo-Johnson lambda could not be estimated for some geos for {col}.",
+  #         i = "Using lambda={x$na_lambda_fill} in these cases."
+  #       ),
+  #       call = rlang::caller_fn()
+  #     )
+  #   }
+  # }
+
+  # Fill in NAs with the default lambda.
+  lambdas %>%
+    mutate(across(starts_with(".lambda_"), \(col) ifelse(is.na(col), na_lambda_fill, col)))
+}
+
+
+### Code below taken from recipes::step_YeoJohnson.
+### https://github.com/tidymodels/recipes/blob/v1.1.1/R/YeoJohnson.R#L172
+
+#' Internal Functions
+#'
+#' Note that this function is vectorized in x, but not in lambda.
+#'
+#' @keywords internal
+#' @rdname recipes-internal
+#' @export
+yj_transform <- function(x, lambda, ind_neg = NULL, eps = 0.001) {
+  if (is.na(lambda)) {
+    return(x)
+  }
+  if (!inherits(x, "tbl_df") || is.data.frame(x)) {
+    x <- unlist(x, use.names = FALSE)
+  } else {
+    if (!is.vector(x)) {
+      x <- as.vector(x)
+    }
+  }
+  # TODO case weights: can we use weights here?
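+  # Yeo-Johnson transform (Yeo & Johnson, 2000), applied piecewise below:
+  #   x >= 0, lambda != 0:  ((x + 1)^lambda - 1) / lambda
+  #   x >= 0, lambda == 0:  log(x + 1)
+  #   x <  0, lambda != 2:  -((-x + 1)^(2 - lambda) - 1) / (2 - lambda)
+  #   x <  0, lambda == 2:  -log(-x + 1)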
+  if (is.null(ind_neg)) {
+    dat_neg <- x < 0
+    ind_neg <- list(is = which(dat_neg), not = which(!dat_neg))
+  }
+  not_neg <- ind_neg[["not"]]
+  is_neg <- ind_neg[["is"]]
+
+  nn_trans <- function(x, lambda) {
+    if (abs(lambda) < eps) {
+      log(x + 1)
+    } else {
+      ((x + 1)^lambda - 1) / lambda
+    }
+  }
+
+  ng_trans <- function(x, lambda) {
+    if (abs(lambda - 2) < eps) {
+      -log(-x + 1)
+    } else {
+      -((-x + 1)^(2 - lambda) - 1) / (2 - lambda)
+    }
+  }
+
+  if (length(not_neg) > 0) {
+    x[not_neg] <- nn_trans(x[not_neg], lambda)
+  }
+
+  if (length(is_neg) > 0) {
+    x[is_neg] <- ng_trans(x[is_neg], lambda)
+  }
+  x
+}
+
+## Helper for the log-likelihood calc for eq 3.1 of Yeo, I. K.,
+## & Johnson, R. A. (2000). A new family of power transformations
+## to improve normality or symmetry. Biometrika. page 957
+ll_yj <- function(lambda, y, ind_neg, const, eps = 0.001) {
+  n <- length(y)
+  y_t <- yj_transform(y, lambda, ind_neg)
+  # EDIT: Unused in the original recipes code.
+  # mu_t <- mean(y_t)
+  var_t <- var(y_t) * (n - 1) / n
+  res <- -.5 * n * log(var_t) + (lambda - 1) * const
+  res
+}
+
+## eliminates missing data and returns the log-likelihood
+yj_obj <- function(lam, dat, ind_neg, const) {
+  ll_yj(lambda = lam, y = dat, ind_neg = ind_neg, const = const)
+}
+
+## estimates the values
+#' @keywords internal
+#' @rdname recipes-internal
+#' @export
+estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, call = caller_env(2)) {
+  na_rows <- which(is.na(dat))
+  if (length(na_rows) > 0) {
+    if (na_rm) {
+      dat <- dat[-na_rows]
+    } else {
+      cli::cli_abort(
+        c(
+          x = "Missing values are not allowed for the YJ transformation.",
+          i = "See {.arg na_rm} option."
+        ),
+        call = call
+      )
+    }
+  }
+
+  eps <- .001
+  if (length(unique(dat)) < num_unique) {
+    return(NA)
+  }
+  dat_neg <- dat < 0
+  ind_neg <- list(is = which(dat_neg), not = which(!dat_neg))
+
+  const <- sum(sign(dat) * log(abs(dat) + 1))
+
+  suppressWarnings(
+    res <- optimize(
+      yj_obj,
+      interval = limits,
+      maximum = TRUE,
+      dat = dat,
+      ind_neg = ind_neg,
+      const = const,
+      tol = .0001
+    )
+  )
+  lam <- res$maximum
+  if (abs(limits[1] - lam) <= eps | abs(limits[2] - lam) <= eps) {
+    lam <- NA
+  }
+  lam
+}
+
+# Copied from recipes:::tidy.step_BoxCox, adapted to the per-group lambda table.
+#
+#' @rdname tidy.recipe
+#' @export
+tidy.step_epi_YeoJohnson <- function(x, ...) {
+  if (is_trained(x)) {
+    # `x$lambdas` is a tibble keyed by the epi key columns, with one
+    # `.lambda_<column>` column per transformed variable, so pivot it into the
+    # usual `terms`/`value` shape (keeping the key columns).
+    res <- x$lambdas %>%
+      tidyr::pivot_longer(
+        cols = starts_with(".lambda_"),
+        names_to = "terms",
+        names_prefix = ".lambda_",
+        values_to = "value"
+      )
+  } else {
+    term_names <- sel2char(x$terms)
+    res <- tibble(
+      terms = term_names,
+      value = na_dbl
+    )
+  }
+  res$id <- x$id
+  res
+}
diff --git a/_targets.yaml b/_targets.yaml
index 3f31a6e6..c76259e4 100644
--- a/_targets.yaml
+++ b/_targets.yaml
@@ -18,4 +18,8 @@ covid_hosp_prod:
   store: covid_hosp_prod
   use_crew: yes
   reporter_make: timestamp
-
+# test_proj:
+#   script: scripts/test_proj.R
+#   store: test_proj
+#   use_crew: yes
+#   reporter_make: timestamp
diff --git a/test-yeo-johnson.Rmd b/test-yeo-johnson.Rmd
new file mode 100644
index 00000000..27379ad7
--- /dev/null
+++ b/test-yeo-johnson.Rmd
@@ -0,0 +1,88 @@
+---
+title: "Yeo-Johnson Transformation Testing"
+output:
+  html_document:
+    self_contained: True
+editor_options:
+  chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(
+  digits = 3,
+  comment = "#>",
+  collapse = TRUE,
+  cache = FALSE,
+  dev.args = list(bg = "transparent"),
+  dpi = 300,
+  cache.lazy = FALSE,
+  out.width = "90%",
+  fig.align = "center",
+  fig.width = 9,
+  fig.height = 6
+)
+ggplot2::theme_set(ggplot2::theme_bw())
+options(
+  dplyr.print_min = 6,
+  dplyr.print_max = 6,
+  pillar.max_footer_lines = 2,
+  pillar.min_chars = 15,
+  stringr.view_n = 6,
+  pillar.bold = TRUE,
+  width = 77
+)
+suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
+```
+
+## Setup and Data Loading
+
+First, we'll set up the environment and load the necessary data:
+
+```{r setup-env}
+# Simple case with keys = geo_value.
+filtered_data <- cases_deaths_subset %>%
+  filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
+  select(geo_value, time_value, cases)
+```
+
+## Yeo-Johnson Transformation
+
+Let's apply the Yeo-Johnson transformation to our data:
+
+```{r yeo-johnson-transform}
+r <- epi_recipe(filtered_data) %>%
+  step_epi_YeoJohnson(cases) %>%
+  prep(filtered_data)
+
+# Display the recipe
+r
+
+# Inspect the lambda values for each state
+r$steps[[1]]$lambdas
+```
+
+## Manual Transformation Comparison
+
+Now, let's compare the Yeo-Johnson transformation with a manual variance-stabilizing approach using quarter-root scaling:
+
+```{r manual-whitening}
+# Apply the transformation
+out1 <- r %>% bake(filtered_data)
+out2 <- filtered_data %>%
+  mutate(cases = (cases + 0.01)^(1 / 4))
+
+all_together <- rbind(
+  filtered_data %>%
+    mutate(name = "raw"),
+  out1 %>% mutate(name = "yeo-johnson"),
+  out2 %>% mutate(name = "quarter-root")
+)
+
+all_together %>%
+  ggplot(aes(time_value, cases, color = name)) +
+  geom_line() +
+  facet_grid(~geo_value, scales = "free_y") +
+  theme_minimal() +
+  labs(title = "Yeo-Johnson vs. quarter-root scaling", x = "Time", y = "Cases (log scale)") +
+  scale_y_log10()
+```
diff --git a/tests/testthat/_snaps/forecasters-basics.md b/tests/testthat/_snaps/forecasters-basics.md
index d82ac804..15216ea7 100644
--- a/tests/testthat/_snaps/forecasters-basics.md
+++ b/tests/testthat/_snaps/forecasters-basics.md
@@ -17,8 +17,3 @@
       ! Can't rename columns that don't exist.
       x Column `slide_value_case_rate` doesn't exist.
 
-# no_recent_outcome deals with no as_of
-
-    Code
-      res <- forecaster[[2]](jhu, "case_rate", extra_sources = "death_rate", ahead = 2L)
-
diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R
new file mode 100644
index 00000000..28ca7b72
--- /dev/null
+++ b/tests/testthat/test-yeo-johnson.R
@@ -0,0 +1,132 @@
+suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
+
+test_that("Yeo-Johnson transformation inverts correctly", {
+  # Note that the special lambda values of 0 and 2 are covered by the tests
+  # below.
+  expect_true(
+    map_lgl(seq(-5, 5, 0.1), function(lambda) {
+      map_lgl(seq(-10, 10, 0.1), \(x) abs(yj_inverse(yj_transform(x, lambda), lambda) - x) < 0.00001) %>% all()
+    }) %>%
+      all()
+  )
+})
+
+test_that("Yeo-Johnson steps and layers invert each other", {
+  jhu <- cases_deaths_subset %>%
+    filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
+    select(geo_value, time_value, cases)
+  filtered_data <- jhu
+
+  # Get some lambda values
+  r <- epi_recipe(filtered_data) %>%
+    step_epi_YeoJohnson(cases) %>%
+    step_epi_lag(cases, lag = 0) %>%
+    step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
+    step_epi_naomit()
+  tr <- r %>% prep(filtered_data)
+
+  # Check general lambda values tibble structure
+  expect_true(".lambda_cases" %in% names(tr$steps[[1]]$lambdas))
+  expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_cases))
+  # Still works on a tibble
+  expect_equal(
+    tr %>% bake(filtered_data %>% as_tibble()),
+    tr %>% bake(filtered_data)
+  )
+
+  # Make sure that the inverse transformation works
+  f <- frosting() %>%
+    layer_predict() %>%
+    layer_epi_YeoJohnson(.pred)
+  wf <- epi_workflow(r, linear_reg()) %>%
+    fit(filtered_data) %>%
+    add_frosting(f)
+  out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value)
+  out2 <- forecast(wf) %>% rename(cases = .pred)
+  expect_equal(out1, out2)
+
+  # Make sure it works when there are multiple predictors and outcomes
+  jhu_multi <- epidatasets::covid_case_death_rates_extended %>%
+    filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
+    select(geo_value, time_value, case_rate, death_rate)
+  filtered_data <- jhu_multi
+  r <- epi_recipe(filtered_data) %>%
+    step_epi_YeoJohnson(case_rate, death_rate) %>%
+    step_epi_lag(case_rate, death_rate, lag = 0) %>%
+    step_epi_ahead(case_rate, death_rate, ahead = 0, role = "outcome") %>%
+    step_epi_naomit()
+  tr <- r %>% prep(filtered_data)
+
+  # Check general lambda values tibble structure
+  expect_true(".lambda_case_rate" %in% names(tr$steps[[1]]$lambdas))
+  expect_true(".lambda_death_rate" %in% names(tr$steps[[1]]$lambdas))
+  expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_case_rate))
+  expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_death_rate))
+
+  # Make sure that the inverse transformation works
+  f <- frosting() %>%
+    layer_predict() %>%
+    layer_epi_YeoJohnson(.pred_ahead_0_case_rate, .pred_ahead_0_death_rate)
+  wf <- epi_workflow(r, linear_reg()) %>%
+    fit(filtered_data) %>%
+    add_frosting(f)
+  out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value)
+  # debugonce(slather.layer_epi_YeoJohnson)
+  out2 <- forecast(wf) %>% rename(case_rate = .pred_ahead_0_case_rate, death_rate = .pred_ahead_0_death_rate)
+  expect_equal(out1, out2)
+})
+
+test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", {
+  # Small synthetic grad_employ_dataset version.
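+  # Two other_keys (age_group, edu_qual) and wildly different magnitudes per
+  # group, so each (geo_value, age_group, edu_qual) combination should get its
+  # own lambda.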
+  filtered_data <- tribble(
+    ~geo_value, ~age_group, ~edu_qual, ~time_value, ~med_income_2y,
+    "ca", "25-34", "bachelor", 2017, 50000,
+    "ca", "25-34", "bachelor", 2018, 50500,
+    "ca", "25-34", "bachelor", 2019, 51000,
+    "ca", "25-34", "bachelor", 2020, 51500,
+    "ca", "25-34", "bachelor", 2021, 52000,
+    "ca", "25-34", "bachelor", 2022, 52500,
+    "ca", "35-1000", "bachelor", 2017, 3e10,
+    "ca", "35-1000", "bachelor", 2018, 3e10 + 10,
+    "ca", "35-1000", "bachelor", 2019, 3e10 + 20,
+    "ca", "35-1000", "bachelor", 2020, 3e10 + 30,
+    "ca", "35-1000", "bachelor", 2021, 3e10 + 40,
+    "ca", "35-1000", "bachelor", 2022, 3e10 + 50,
+    "ca", "25-34", "master", 2017, 2 * 50000,
+    "ca", "25-34", "master", 2018, 2 * 50500,
+    "ca", "25-34", "master", 2019, 2 * 51000,
+    "ca", "25-34", "master", 2020, 2 * 51500,
+    "ca", "25-34", "master", 2021, 2 * 52000,
+    "ca", "25-34", "master", 2022, 2 * 52500,
+    "ca", "35-1000", "master", 2017, 2 * 3e10,
+    "ca", "35-1000", "master", 2018, 2 * (3e10 + 10),
+    "ca", "35-1000", "master", 2019, 2 * (3e10 + 20),
+    "ca", "35-1000", "master", 2020, 2 * (3e10 + 30),
+    "ca", "35-1000", "master", 2021, 2 * (3e10 + 40),
+    "ca", "35-1000", "master", 2022, 2 * (3e10 + 50)
+  ) %>% as_epi_df(other_keys = c("age_group", "edu_qual"))
+
+  # Get some lambda values
+  r <- epi_recipe(filtered_data) %>%
+    step_epi_YeoJohnson(med_income_2y) %>%
+    step_epi_lag(med_income_2y, lag = 0) %>%
+    step_epi_ahead(med_income_2y, ahead = 0, role = "outcome") %>%
+    step_epi_naomit()
+  tr <- r %>% prep(filtered_data)
+  expect_true(".lambda_med_income_2y" %in% names(tr$steps[[1]]$lambdas))
+  expect_true("geo_value" %in% names(tr$steps[[1]]$lambdas))
+  expect_true("age_group" %in% names(tr$steps[[1]]$lambdas))
+  expect_true("edu_qual" %in% names(tr$steps[[1]]$lambdas))
+  expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_med_income_2y))
+
+  # Make sure that the inverse transformation works
+  f <- frosting() %>%
+    layer_predict() %>%
+    layer_epi_YeoJohnson(.pred)
+  wf <- epi_workflow(r, linear_reg()) %>%
+    fit(filtered_data) %>%
+    add_frosting(f)
+  out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value) %>% select(geo_value, age_group, time_value, med_income_2y) %>% arrange(geo_value, age_group, time_value)
+  out2 <- forecast(wf) %>% rename(med_income_2y = .pred) %>% select(geo_value, age_group, time_value, med_income_2y) %>% arrange(geo_value, age_group, time_value)
+  expect_equal(out1, out2)
+})