diff --git a/NAMESPACE b/NAMESPACE index ff1c15c5..1d8affef 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -69,6 +69,7 @@ importFrom(data.table,key) importFrom(data.table,set) importFrom(data.table,setkeyv) importFrom(dplyr,arrange) +importFrom(dplyr,bind_rows) importFrom(dplyr,dplyr_col_modify) importFrom(dplyr,dplyr_reconstruct) importFrom(dplyr,dplyr_row_slice) @@ -76,6 +77,7 @@ importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,group_by_drop_default) importFrom(dplyr,group_modify) +importFrom(dplyr,group_vars) importFrom(dplyr,groups) importFrom(dplyr,mutate) importFrom(dplyr,relocate) diff --git a/R/slide.R b/R/slide.R index c459f21e..635d4d3d 100644 --- a/R/slide.R +++ b/R/slide.R @@ -23,7 +23,10 @@ #' If `f` is missing, then `...` will specify the computation. #' @param ... Additional arguments to pass to the function or formula specified #' via `f`. Alternatively, if `f` is missing, then the `...` is interpreted as -#' an expression for tidy evaluation. See details. +#' an expression for tidy evaluation; in addition to referring to columns +#' directly by name, the expression has access to `.data` and `.env` pronouns +#' as in `dplyr` verbs, and can also refer to `.x`, `.group_key`, and +#' `.ref_time_value`. See details. #' @param before,after How far `before` and `after` each `ref_time_value` should #' the sliding window extend? At least one of these two arguments must be #' provided; the other's default will be 0. Any value provided for either @@ -119,7 +122,8 @@ #' through the `new_col_name` argument. #' #' @importFrom lubridate days weeks -#' @importFrom rlang .data .env !! enquo enquos sym +#' @importFrom dplyr bind_rows group_vars filter select +#' @importFrom rlang .data .env !! enquo enquos sym env #' @export #' @examples #' # slide a 7-day trailing average formula on cases @@ -166,11 +170,8 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, # Check that `f` takes enough args if (!missing(f) && is.function(f)) { - assert_sufficient_f_args(f, ...) + assert_sufficient_f_args(f, ..., n_mandatory_f_args = 3L) } - - # Arrange by increasing time_value - x = arrange(x, time_value) if (missing(ref_time_values)) { ref_time_values = unique(x$time_value) @@ -231,6 +232,35 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, after <- time_step(after) } + min_ref_time_values = ref_time_values - before + min_ref_time_values_not_in_x <- min_ref_time_values[!(min_ref_time_values %in% unique(x$time_value))] + + # Do set up to let us recover `ref_time_value`s later. + # A helper column marking real observations. + x$.real = TRUE + + # Create df containing phony data. Df has the same columns and attributes as + # `x`, but filled with `NA`s aside from grouping columns. Number of rows is + # equal to the number of `min_ref_time_values_not_in_x` we have * the + # number of unique levels seen in the grouping columns. + before_time_values_df = data.frame(time_value=min_ref_time_values_not_in_x) + if (length(group_vars(x)) != 0) { + before_time_values_df = dplyr::cross_join( + # Get unique combinations of grouping columns seen in real data. + unique(x[, group_vars(x)]), + before_time_values_df + ) + } + # Automatically fill in all other columns from `x` with `NA`s, and carry + # attributes over to new df. + before_time_values_df <- bind_rows(x[0,], before_time_values_df) + before_time_values_df$.real <- FALSE + + x <- bind_rows(before_time_values_df, x) + + # Arrange by increasing time_value + x = arrange(x, time_value) + # Now set up starts and stops for sliding/hopping time_range = range(unique(x$time_value)) starts = in_range(ref_time_values - before, time_range) @@ -272,7 +302,9 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, o = .data_group$time_value %in% time_values num_ref_rows = sum(o) - # Count the number of appearances of each reference time value + # Count the number of appearances of each reference time value (these + # appearances should all be real for now, but if we allow ref time values + # outside of .data_group's time values): counts = .data_group %>% dplyr::filter(.data$time_value %in% time_values) %>% dplyr::count(.data$time_value) %>% @@ -282,7 +314,7 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, !all(purrr::map_lgl(slide_values_list, is.data.frame))) { Abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).") } - + # Unlist if appropriate: slide_values = if (as_list_col) { @@ -318,6 +350,7 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, # fills with NA equivalent. vctrs::vec_slice(slide_values, o) = orig_values } else { + # This implicitly removes phony (`.real` == FALSE) observations. .data_group = filter(.data_group, o) } return(mutate(.data_group, !!new_col := slide_values)) @@ -325,9 +358,16 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, # If f is not missing, then just go ahead, slide by group if (!missing(f)) { + if (rlang::is_formula(f)) f = as_slide_computation(f) + f_rtv_wrapper = function(x, g, ...) { + ref_time_value = min(x$time_value) + before + x <- x[x$.real,] + x$.real <- NULL + f(x, g, ref_time_value, ...) + } x = x %>% group_modify(slide_one_grp, - f = f, ..., + f = f_rtv_wrapper, ..., starts = starts, stops = stops, time_values = ref_time_values, @@ -347,7 +387,18 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, } quo = quos[[1]] - f = function(x, quo, ...) rlang::eval_tidy(quo, x) + f = function(.x, .group_key, quo, ...) { + .ref_time_value = min(.x$time_value) + before + .x <- .x[.x$.real,] + .x$.real <- NULL + data_mask = rlang::as_data_mask(.x) + # We'll also install `.x` directly, not as an `rlang_data_pronoun`, so + # that we can, e.g., use more dplyr and epiprocess operations. + data_mask$.x = .x + data_mask$.group_key = .group_key + data_mask$.ref_time_value = .ref_time_value + rlang::eval_tidy(quo, data_mask) + } new_col = sym(names(rlang::quos_auto_name(quos))) x = x %>% @@ -365,5 +416,15 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, if (!as_list_col) { x = unnest(x, !!new_col, names_sep = names_sep) } + + # Remove any remaining phony observations. When `all_rows` is TRUE, phony + # observations aren't necessarily removed in `slide_one_grp`. + if (all_rows) { + x <- x[x$.real,] + } + + # Drop helper column `.real`. + x$.real <- NULL + return(x) } diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index cc95fa5f..33c3a7fb 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -40,7 +40,10 @@ If \code{f} is missing, then \code{...} will specify the computation.} \item{...}{Additional arguments to pass to the function or formula specified via \code{f}. Alternatively, if \code{f} is missing, then the \code{...} is interpreted as -an expression for tidy evaluation. See details.} +an expression for tidy evaluation; in addition to referring to columns +directly by name, the expression has access to \code{.data} and \code{.env} pronouns +as in \code{dplyr} verbs, and can also refer to \code{.x}, \code{.group_key}, and +\code{.ref_time_value}. See details.} \item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should the sliding window extend? At least one of these two arguments must be @@ -73,9 +76,9 @@ contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} \item{as_list_col}{Should the slide results be held in a list column, or be -\link[tidyr:chop]{unchopped}/\link[tidyr:nest]{unnested}? Default is \code{FALSE}, +\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, in which case a list object returned by \code{f} would be unnested (using -\code{\link[tidyr:nest]{tidyr::unnest()}}), and, if the slide computations output data frames, +\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, the names of the resulting columns are given by prepending \code{new_col_name} to the names of the list elements.} diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 814c5e29..68ce52cc 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -79,9 +79,9 @@ contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} \item{as_list_col}{Should the slide results be held in a list column, or be -\link[tidyr:chop]{unchopped}/\link[tidyr:nest]{unnested}? Default is \code{FALSE}, +\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, in which case a list object returned by \code{f} would be unnested (using -\code{\link[tidyr:nest]{tidyr::unnest()}}), and, if the slide computations output data frames, +\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, the names of the resulting columns are given by prepending \code{new_col_name} to the names of the list elements.} diff --git a/man/reexports.Rd b/man/reexports.Rd index b633e86c..46e961d9 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -23,7 +23,7 @@ below to see their documentation. \describe{ \item{dplyr}{\code{\link[dplyr]{arrange}}, \code{\link[dplyr]{filter}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr:group_map]{group_modify}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{relocate}}, \code{\link[dplyr]{rename}}, \code{\link[dplyr]{slice}}, \code{\link[dplyr:group_by]{ungroup}}} - \item{tidyr}{\code{\link[tidyr:nest]{unnest}}} + \item{tidyr}{\code{\link[tidyr]{unnest}}} \item{tsibble}{\code{\link[tsibble:as-tsibble]{as_tsibble}}} }} diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 21191a0b..23bab72f 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -9,7 +9,16 @@ ungrouped = dplyr::bind_rows( as_epi_df() grouped = ungrouped %>% group_by(geo_value) -f = function(x, g) dplyr::tibble(value=mean(x$value), count=length(x$value)) + +small_x = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value=11:15), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value=-(1:5)) +) %>% + as_epi_df(as_of = d + 6) %>% + group_by(geo_value) + + +f = function(x, g, t) dplyr::tibble(value=mean(x$value), count=length(x$value)) toy_edf = tibble::tribble( ~geo_value, ~time_value, ~value , @@ -163,10 +172,10 @@ test_that("computation output formats x as_list_col", { }) test_that("epi_slide alerts if the provided f doesn't take enough args", { - f_xg = function(x, g) dplyr::tibble(value=mean(x$value), count=length(x$value)) + f_xgt = function(x, g, t) dplyr::tibble(value=mean(x$value), count=length(x$value)) # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(epi_slide(grouped, f_xg, before = 1L, ref_time_values = d+1), regexp = NA) - expect_warning(epi_slide(grouped, f_xg, before = 1L, ref_time_values = d+1), regexp = NA) + expect_error(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d+1), regexp = NA) + expect_warning(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d+1), regexp = NA) f_x_dots = function(x, ...) dplyr::tibble(value=mean(x$value), count=length(x$value)) expect_warning(epi_slide(grouped, f_x_dots, before = 1L, ref_time_values = d+1), @@ -301,3 +310,213 @@ test_that("`epi_slide` doesn't decay date output", { inherits("Date") ) }) + +test_that("basic grouped epi_slide computation produces expected output", { + # Also checks that we correctly remove extra rows and columns (`.real`) used + # to recover `ref_time_value`s. + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=cumsum(11:15)), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=cumsum(-(1:5))) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + # formula + result1 <- epi_slide(small_x, f = ~sum(.x$value), before=50) + expect_identical(result1, expected_output) + + # function + result2 <- epi_slide(small_x, f = function(x, g, t) sum(x$value), before=50) + expect_identical(result2, expected_output) + + # dots + result3 <- epi_slide(small_x, slide_value = sum(value), before=50) + expect_identical(result3, expected_output) +}) + +test_that("ungrouped epi_slide computation completes successfully", { + expect_error( + small_x %>% + ungroup() %>% + epi_slide(before = 2, + slide_value = sum(.x$value)), + regexp=NA + ) +}) + +test_that("basic ungrouped epi_slide computation produces expected output", { + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=cumsum(11:15)) + ) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x %>% + ungroup() %>% + filter(geo_value == "ak") %>% + epi_slide(before = 50, + slide_value = sum(.x$value)) + expect_identical(result1, expected_output) + + # Ungrouped with multiple geos + expected_output = dplyr::bind_rows( + dplyr::tibble( + geo_value = "ak", time_value = d + 1:5, value=11:15, slide_value=cumsum(11:15) + cumsum(-(1:5) + )), + dplyr::tibble( + geo_value = "al", time_value = d + 1:5, value=-(1:5), slide_value=cumsum(11:15) + cumsum(-(1:5)) + ) + ) %>% + as_epi_df(as_of = d + 6) %>% + arrange(time_value) + + result2 <- small_x %>% + ungroup() %>% + epi_slide(before = 50, + slide_value = sum(.x$value)) + expect_identical(result2, expected_output) +}) + +test_that("epi_slide computation via formula can use ref_time_value", { + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x %>% + epi_slide(f = ~ .ref_time_value, + before = 50) + + expect_identical(result1, expected_output) + + result2 <- small_x %>% + epi_slide(f = ~ .z, + before = 50) + + expect_identical(result2, expected_output) + + result3 <- small_x %>% + epi_slide(f = ~ ..3, + before = 50) + + expect_identical(result3, expected_output) + + # Ungrouped with multiple geos + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + ) %>% + as_epi_df(as_of = d + 6) %>% + arrange(time_value) + + result4 <- small_x %>% + ungroup() %>% + epi_slide(f = ~ .ref_time_value, + before = 50) + expect_identical(result4, expected_output) +}) + +test_that("epi_slide computation via function can use ref_time_value", { + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x %>% + epi_slide(f = function(x, g, t) t, + before = 2) + + expect_identical(result1, expected_output) +}) + +test_that("epi_slide computation via dots can use ref_time_value and group", { + # ref_time_value + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x %>% + epi_slide(before = 50, + slide_value = .ref_time_value) + + expect_identical(result1, expected_output) + + # `.{x,group_key,ref_time_value}` should be inaccessible from `.data` and + # `.env`. + expect_error(small_x %>% + epi_slide(before = 50, + slide_value = .env$.ref_time_value) + ) + + # group_key + # Use group_key column + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value="ak"), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value="al") + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result3 <- small_x %>% + epi_slide(before = 2, + slide_value = .group_key$geo_value) + + expect_identical(result3, expected_output) + + # Use entire group_key object + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=1L), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=1L) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result4 <- small_x %>% + epi_slide(before = 2, + slide_value = nrow(.group_key)) + + expect_identical(result4, expected_output) + + # Ungrouped with multiple geos + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + ) %>% + as_epi_df(as_of = d + 6) %>% + arrange(time_value) + + result5 <- small_x %>% + ungroup() %>% + epi_slide(before = 50, + slide_value = .ref_time_value) + expect_identical(result5, expected_output) +}) + +test_that("epi_slide computation via dots outputs the same result using col names and the data var", { + expected_output <- small_x %>% + epi_slide(before = 2, + slide_value = max(time_value)) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x %>% + epi_slide(before = 2, + slide_value = max(.x$time_value)) + + expect_identical(result1, expected_output) +}) + +test_that("`epi_slide` can access objects inside of helper functions", { + helper = function(archive_haystack, time_value_needle) { + archive_haystack %>% epi_slide(has_needle = time_value_needle %in% time_value, before = 365000L) + } + expect_error( + helper(small_x, as.Date("2021-01-01")), + NA + ) +})