diff --git a/NAMESPACE b/NAMESPACE index ced7195b..f14e27db 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -92,10 +92,22 @@ importFrom(rlang,"!!") importFrom(rlang,.data) importFrom(rlang,.env) importFrom(rlang,arg_match) +importFrom(rlang,caller_arg) +importFrom(rlang,caller_env) +importFrom(rlang,check_dots_empty0) importFrom(rlang,enquo) importFrom(rlang,enquos) +importFrom(rlang,f_env) +importFrom(rlang,f_rhs) +importFrom(rlang,global_env) +importFrom(rlang,is_environment) +importFrom(rlang,is_formula) +importFrom(rlang,is_function) importFrom(rlang,is_missing) importFrom(rlang,is_quosure) +importFrom(rlang,is_string) +importFrom(rlang,missing_arg) +importFrom(rlang,new_function) importFrom(rlang,quo_is_missing) importFrom(rlang,sym) importFrom(rlang,syms) diff --git a/NEWS.md b/NEWS.md index aea09f44..0af73682 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,16 @@ inter-release development versions will include an additional ".9999" suffix. ## Breaking changes: +* Changes to `epix_slide`: + * The `f` computation is now required to take at least three arguments. `f` + must take an `epi_df` with the same column names as the archive's `DT`, + minus the `version` column; followed by a one-row tibble containing the + values of the grouping variables for the associated group; followed by a + reference time value, usually as a `Date` object; followed by any number + of named arguments. + +## New features: + * `epix_slide` has been made more like `dplyr::group_modify`. It will no longer perform element/row recycling for size stability, accepts slide computation outputs containing any number of rows, and no longer supports `all_rows`. @@ -19,6 +29,11 @@ inter-release development versions will include an additional ".9999" suffix. more closely whether/when/how to output an `epi_df`. * To keep the old behavior, convert the output of `epix_slide()` to `epi_df` when desired and set the metadata appropriately. +* `epix_slide` `f` computations passed as functions or formulas now have + access to the reference time value. If `f` is a function, it is passed a + Date containing the reference time value as the third argument. If a + formula, `f` can access the reference time value via `.z` or + `.ref_time_value`. ## Improvements: diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index b17bcd98..b03dc8c1 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -232,7 +232,7 @@ grouped_epi_archive = # Check that `f` takes enough args if (!missing(f) && is.function(f)) { - assert_sufficient_f_args(f, ...) + assert_sufficient_f_args(f, ..., n_mandatory_f_args = 3L) } # Validate and pre-process `before`: @@ -272,7 +272,7 @@ grouped_epi_archive = ref_time_value, new_col) { # Carry out the specified computation - comp_value = f(.data_group, .group_key, ...) + comp_value = f(.data_group, .group_key, ref_time_value, ...) if (all_versions) { # Extract data from archive so we can do length checks below. When @@ -298,7 +298,7 @@ grouped_epi_archive = # If f is not missing, then just go ahead, slide by group if (!missing(f)) { - if (rlang::is_formula(f)) f = rlang::as_function(f) + if (rlang::is_formula(f)) f = as_slide_computation(f) x = purrr::map_dfr(ref_time_values, function(ref_time_value) { # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, # `epi_archive` if `all_versions` is `TRUE`: diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 4db7e45d..d04e30d3 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -665,14 +665,17 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' sliding (a.k.a. "rolling") time window for each data group. The window is #' determined by the `before` parameter described below. One time step is #' typically one day or one week; see [`epi_slide`] details for more -#' explanation. If a function, `f` must take `x`, an `epi_df` with the same -#' column names as the archive's `DT`, minus the `version` column; followed by -#' `g`, a one-row tibble containing the values of the grouping variables for -#' the associated group; followed by any number of named arguments. If a -#' formula, `f` can operate directly on columns accessed via `.x$var`, as in -#' `~ mean(.x$var)` to compute a mean of a column `var` for each -#' `ref_time_value`-group combination. If `f` is missing, then `...` will -#' specify the computation. +#' explanation. If a function, `f` must take an `epi_df` with the same +#' column names as the archive's `DT`, minus the `version` column; followed +#' by a one-row tibble containing the values of the grouping variables for +#' the associated group; followed by a reference time value, usually as a +#' `Date` object; followed by any number of named arguments. If a formula, +#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as +#' in `~ mean (.x$var)` to compute a mean of a column `var` for each +#' group-`ref_time_value` combination. The group key can be accessed via +#' `.y` or `.group_key`, and the reference time value can be accessed via +#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the +#' computation. #' @param ... Additional arguments to pass to the function or formula specified #' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an #' expression for tidy evaluation. See details of [`epi_slide`]. @@ -827,7 +830,7 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' archive_cases_dv_subset %>% #' group_by(geo_value) %>% #' epix_slide( -#' function(x, g) { +#' function(x, gk, rtv) { #' tibble( #' time_range = if(nrow(x) == 0L) { #' "0 `time_value`s" @@ -855,7 +858,7 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' archive_cases_dv_subset %>% #' group_by(geo_value) %>% #' epix_slide( -#' function(x, g) { +#' function(x, gk, rtv) { #' tibble( #' versions_start = if (nrow(x$DT) == 0L) { #' "NA (0 rows)" diff --git a/R/slide.R b/R/slide.R index ab591f99..c459f21e 100644 --- a/R/slide.R +++ b/R/slide.R @@ -12,15 +12,15 @@ #' sliding (a.k.a. "rolling") time window for each data group. The window is #' determined by the `before` and `after` parameters described below. One time #' step is typically one day or one week; see details for more explanation. If -#' a function, `f` must take `x`, a data frame with the same column names as +#' a function, `f` must take a data frame with the same column names as #' the original object, minus any grouping variables, containing the time -#' window data for one `ref_time_value`-group combination; followed by `g`, a +#' window data for one group-`ref_time_value` combination; followed by a #' one-row tibble containing the values of the grouping variables for the #' associated group; followed by any number of named arguments. If a formula, -#' `f` can operate directly on columns accessed via `.x$var`, as in `~ -#' mean(.x$var)` to compute a mean of a column `var` for each -#' `ref_time_value`-group combination. If `f` is missing, then `...` will -#' specify the computation. +#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as +#' in `~mean(.x$var)` to compute a mean of a column `var` for each +#' `ref_time_value`-group combination. The group key can be accessed via `.y`. +#' If `f` is missing, then `...` will specify the computation. #' @param ... Additional arguments to pass to the function or formula specified #' via `f`. Alternatively, if `f` is missing, then the `...` is interpreted as #' an expression for tidy evaluation. See details. diff --git a/R/utils.R b/R/utils.R index 349c173a..e4625a4f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -103,17 +103,19 @@ Warn = function(msg, ...) rlang::warn(break_str(msg, init = "Warning: "), ...) #' Assert that a sliding computation function takes enough args #' #' @param f Function; specifies a computation to slide over an `epi_df` or -#' `epi_archive` in `epi_slide` or `epix_slide`. +#' `epi_archive` in `epi_slide` or `epix_slide`. #' @param ... Dots that will be forwarded to `f` from the dots of `epi_slide` or #' `epix_slide`. +#' @param n_mandatory_f_args Integer; specifies the number of arguments `f` +#' is required to take before any `...` arg. Defaults to 2. #' #' @importFrom rlang is_missing #' @importFrom purrr map_lgl #' @importFrom utils tail #' #' @noRd -assert_sufficient_f_args <- function(f, ...) { - mandatory_f_args_labels <- c("window data", "group key") +assert_sufficient_f_args <- function(f, ..., n_mandatory_f_args = 2L) { + mandatory_f_args_labels <- c("window data", "group key", "reference time value")[seq(n_mandatory_f_args)] n_mandatory_f_args <- length(mandatory_f_args_labels) args = formals(args(f)) args_names = names(args) @@ -181,6 +183,109 @@ assert_sufficient_f_args <- function(f, ...) { } } +#' Convert to function +#' +#' @description +#' `as_slide_computation()` transforms a one-sided formula into a function. +#' This powers the lambda syntax in packages like purrr. +#' +#' This code and documentation borrows heavily from [`rlang::as_function`] +#' (https://github.com/r-lib/rlang/blob/c55f6027928d3104ed449e591e8a225fcaf55e13/R/fn.R#L343-L427). +#' +#' This code extends `rlang::as_function` to create functions that take three +#' arguments. The arguments can be accessed via the idiomatic `.x`, `.y`, +#' etc, positional references (`..1`, `..2`, etc), and also by `epi +#' [x]_slide`-specific names. +#' +#' @source https://github.com/r-lib/rlang/blob/c55f6027928d3104ed449e591e8a225fcaf55e13/R/fn.R#L343-L427 +#' +#' @param x A function or formula. +#' +#' If a **function**, it is used as is. +#' +#' If a **formula**, e.g. `~ mean(.x$cases)`, it is converted to a function with up +#' to three arguments: `.x` (single argument), or `.x` and `.y` +#' (two arguments), or `.x`, `.y`, and `.z` (three arguments). The `.` +#' placeholder can be used instead of `.x`, `.group_key` can be used in +#' place of `.y`, and `.ref_time_value` can be used in place of `.z`. This +#' allows you to create very compact anonymous functions (lambdas) with up +#' to three inputs. Functions created from formulas have a special class. Use +#' `rlang::is_lambda()` to test for it. +#' +#' If a **string**, the function is looked up in `env`. Note that +#' this interface is strictly for user convenience because of the +#' scoping issues involved. Package developers should avoid +#' supplying functions by name and instead supply them by value. +#' +#' @param env Environment in which to fetch the function in case `x` +#' is a string. +#' @inheritParams rlang::args_dots_empty +#' @inheritParams rlang::args_error_context +#' @examples +#' f <- as_slide_computation(~ .x + 1) +#' f(10) +#' +#' g <- as_slide_computation(~ -1 * .) +#' g(4) +#' +#' h <- as_slide_computation(~ .x - .group_key) +#' h(6, 3) +#' +#' @importFrom rlang check_dots_empty0 is_function new_function f_env +#' is_environment missing_arg f_rhs is_string is_formula caller_arg +#' caller_env global_env +#' +#' @noRd +as_slide_computation <- function(x, + env = global_env(), + ..., + arg = caller_arg(x), + call = caller_env()) { + check_dots_empty0(...) + + if (is_function(x)) { + return(x) + } + + if (is_formula(x)) { + if (length(x) > 2) { + Abort(sprintf("%s must be a one-sided formula", arg), + class = "epiprocess__as_slide_computation__formula_is_twosided", + epiprocess__x = x, + call = call) + } + + env <- f_env(x) + if (!is_environment(env)) { + Abort("Formula must carry an environment.", + class = "epiprocess__as_slide_computation__formula_has_no_env", + epiprocess__x = x, + epiprocess__x_env = env, + arg = arg, call = call) + } + + args <- list( + ... = missing_arg(), + .x = quote(..1), .y = quote(..2), .z = quote(..3), + . = quote(..1), .group_key = quote(..2), .ref_time_value = quote(..3) + ) + fn <- new_function(args, f_rhs(x), env) + fn <- structure(fn, class = c("epiprocess_slide_computation", "function")) + return(fn) + } + + if (is_string(x)) { + return(get(x, envir = env, mode = "function")) + } + + Abort(sprintf("Can't convert a %s to a slide computation", class(x)), + class = "epiprocess__as_slide_computation__cant_convert_catchall", + epiprocess__x = x, + epiprocess__x_class = class(x), + arg = arg, + call = call) +} + ########## in_range = function(x, rng) pmin(pmax(x, rng[1]), rng[2]) diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 2a646670..cc95fa5f 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -28,14 +28,15 @@ computation to slide. To "slide" means to apply a computation within a sliding (a.k.a. "rolling") time window for each data group. The window is determined by the \code{before} and \code{after} parameters described below. One time step is typically one day or one week; see details for more explanation. If -a function, \code{f} must take \code{x}, a data frame with the same column names as +a function, \code{f} must take a data frame with the same column names as the original object, minus any grouping variables, containing the time -window data for one \code{ref_time_value}-group combination; followed by \code{g}, a +window data for one group-\code{ref_time_value} combination; followed by a one-row tibble containing the values of the grouping variables for the associated group; followed by any number of named arguments. If a formula, -\code{f} can operate directly on columns accessed via \code{.x$var}, as in \code{~ mean(.x$var)} to compute a mean of a column \code{var} for each -\code{ref_time_value}-group combination. If \code{f} is missing, then \code{...} will -specify the computation.} +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~mean(.x$var)} to compute a mean of a column \code{var} for each +\code{ref_time_value}-group combination. The group key can be accessed via \code{.y}. +If \code{f} is missing, then \code{...} will specify the computation.} \item{...}{Additional arguments to pass to the function or formula specified via \code{f}. Alternatively, if \code{f} is missing, then the \code{...} is interpreted as diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index e9b755b9..fd2a2646 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -26,14 +26,17 @@ computation to slide. To "slide" means to apply a computation over a sliding (a.k.a. "rolling") time window for each data group. The window is determined by the \code{before} parameter described below. One time step is typically one day or one week; see \code{\link{epi_slide}} details for more -explanation. If a function, \code{f} must take \code{x}, an \code{epi_df} with the same -column names as the archive's \code{DT}, minus the \code{version} column; followed by -\code{g}, a one-row tibble containing the values of the grouping variables for -the associated group; followed by any number of named arguments. If a -formula, \code{f} can operate directly on columns accessed via \code{.x$var}, as in -\code{~ mean(.x$var)} to compute a mean of a column \code{var} for each -\code{ref_time_value}-group combination. If \code{f} is missing, then \code{...} will -specify the computation.} +explanation. If a function, \code{f} must take an \code{epi_df} with the same +column names as the archive's \code{DT}, minus the \code{version} column; followed +by a one-row tibble containing the values of the grouping variables for +the associated group; followed by a reference time value, usually as a +\code{Date} object; followed by any number of named arguments. If a formula, +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each +group-\code{ref_time_value} combination. The group key can be accessed via +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the +computation.} \item{...}{Additional arguments to pass to the function or formula specified via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an @@ -209,7 +212,7 @@ archive_cases_dv_subset \%>\% archive_cases_dv_subset \%>\% group_by(geo_value) \%>\% epix_slide( - function(x, g) { + function(x, gk, rtv) { tibble( time_range = if(nrow(x) == 0L) { "0 `time_value`s" @@ -237,7 +240,7 @@ archive_cases_dv_subset \%>\% archive_cases_dv_subset \%>\% group_by(geo_value) \%>\% epix_slide( - function(x, g) { + function(x, gk, rtv) { tibble( versions_start = if (nrow(x$DT) == 0L) { "NA (0 rows)" diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 34fef705..1ae018d0 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -1,6 +1,6 @@ library(dplyr) -test_that("epix_slide only works on an epi_archive",{ +test_that("epix_slide only works on an epi_archive", { expect_error(epix_slide(data.frame(x=1))) }) @@ -14,7 +14,7 @@ x <- tibble::tribble(~version, ~time_value, ~binary, xx <- bind_cols(geo_value = rep("x",15), x) %>% as_epi_archive() -test_that("epix_slide works as intended",{ +test_that("epix_slide works as intended", { xx1 <- xx %>% group_by(.data$geo_value) %>% epix_slide(f = ~ sum(.x$binary), @@ -39,12 +39,12 @@ test_that("epix_slide works as intended",{ new_col_name = 'sum_binary') ) - expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical + expect_identical(xx1,xx3) # This and * imply xx2 and xx3 are identical # function interface xx4 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = function(x, g) { + epix_slide(f = function(x, gk, rtv) { tibble::tibble(sum_binary = sum(x$binary)) }, before = 2, names_sep = NULL) @@ -304,7 +304,7 @@ test_that("epix_slide with all_versions option has access to all older versions" # `waldo` package: testthat::local_edition(3) - slide_fn <- function(x, g) { + slide_fn <- function(x, gk, rtv) { return(tibble(n_versions = length(unique(x$DT$version)), n_row = nrow(x$DT), dt_class1 = class(x$DT)[[1L]], @@ -376,7 +376,7 @@ test_that("as_of and epix_slide with long enough window are compatible", { # For all_versions = FALSE: - f1 = function(x, g) { + f1 = function(x, gk, rtv) { tibble( diff_mean = mean(diff(x$binary)) ) @@ -390,11 +390,11 @@ test_that("as_of and epix_slide with long enough window are compatible", { # For all_versions = TRUE: - f2 = function(x, g) { + f2 = function(x, gk, rtv) { x %>% # extract time&version-lag-1 data: epix_slide( - function(subx, subg) { + function(subx, subgk, rtv) { tibble(data = list( subx %>% filter(time_value == attr(subx, "metadata")$as_of - 1) %>% @@ -434,7 +434,7 @@ test_that("as_of and epix_slide with long enough window are compatible", { }) test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`",{ - slide_fn <- function(x, g) { + slide_fn <- function(x, gk, rtv) { expect_true(is_epi_archive(x)) return(NA) } @@ -447,7 +447,7 @@ test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_version all_versions = TRUE) }) -test_that("epix_slide with all_versions option works as intended",{ +test_that("epix_slide with all_versions option works as intended", { xx1 <- xx %>% group_by(.data$geo_value) %>% epix_slide(f = ~ sum(.x$DT$binary), @@ -500,7 +500,7 @@ test_that("epix_slide with all_versions option works as intended",{ test_that("epix_slide works with 0-row computation outputs", { epix_slide_empty = function(ea, ...) { ea %>% - epix_slide(before = 5L, ..., function(x, g) { + epix_slide(before = 5L, ..., function(x, gk, rtv) { tibble::tibble() }) } @@ -563,16 +563,66 @@ test_that("epix_slide works with 0-row computation outputs", { # }) test_that("epix_slide alerts if the provided f doesn't take enough args", { - f_xg = function(x, g) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xgt = function(x, g, t) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(epix_slide(xx, f = f_xg, before = 2L), regexp = NA) - expect_warning(epix_slide(xx, f = f_xg, before = 2L), regexp = NA) + expect_error(epix_slide(xx, f = f_xgt, before = 2L), regexp = NA) + expect_warning(epix_slide(xx, f = f_xgt, before = 2L), regexp = NA) f_x_dots = function(x, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) expect_warning(epix_slide(xx, f_x_dots, before = 2L), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") }) +test_that("epix_slide computation can use ref_time_value", { + # Formula + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ .ref_time_value, + before = 2) + + xx_ref <- tibble(geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = c(4,5,6,7) + ) %>% + group_by(geo_value) + + expect_identical(xx1,xx_ref) + + xx2 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ .z, + before = 2) + + expect_identical(xx2,xx_ref) + + xx3 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ ..3, + before = 2) + + expect_identical(xx3,xx_ref) + + # Function + xx4 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = function(x, g, t) t, + before = 2) + + expect_identical(xx4,xx_ref) + + # Dots + expect_error(xx %>% + group_by(.data$geo_value) %>% + epix_slide(before = 2, + slide_value = ref_time_value), + "object 'ref_time_value' not found") +expect_error(xx %>% + group_by(.data$geo_value) %>% + epix_slide(before = 2, + slide_value = .env$ref_time_value), + "object 'ref_time_value' not found") +}) + test_that("`epix_slide` doesn't decay date output", { expect_true( xx$DT %>% diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 6648ce3c..4b1c38d2 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -184,3 +184,36 @@ test_that("assert_sufficient_f_args alerts if the provided f has defaults for th regexp = "window data and group key to `f`'s x and setting argument", class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") }) + +test_that("computation formula-derived functions take all argument types", { + # positional + expect_identical(as_slide_computation(~ ..2 + ..3)(1, 2, 3), 5) + expect_identical(as_slide_computation(~ ..1)(1, 2, 3), 1) + # Matching rlang, purr, dplyr usage + expect_identical(as_slide_computation(~ .x + .z)(1, 2, 3), 4) + expect_identical(as_slide_computation(~ .x + .y)(1, 2, 3), 3) + # named + expect_identical(as_slide_computation(~ . + .ref_time_value)(1, 2, 3), 4) + expect_identical(as_slide_computation(~ .group_key)(1, 2, 3), 2) +}) + +test_that("as_slide_computation passes functions unaltered", { + f <- function(a, b, c) {a * b * c + 5} + expect_identical(as_slide_computation(f), f) +}) + +test_that("as_slide_computation raises errors as expected", { + # Formulas must be one-sided + expect_error(as_slide_computation(y ~ ..1), + class="epiprocess__as_slide_computation__formula_is_twosided") + + # `f_env` must be an environment + formula_without_env <- stats::as.formula(~ ..1) + rlang::f_env(formula_without_env) <- 5 + expect_error(as_slide_computation(formula_without_env), + class="epiprocess__as_slide_computation__formula_has_no_env") + + # `f` must be a function, formula, or string + expect_error(as_slide_computation(5), + class="epiprocess__as_slide_computation__cant_convert_catchall") +})