diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index f4b17a4b..2fca5dbd 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,7 +4,7 @@ on: push: branches: [main, master] pull_request: - branches: [main, master] + branches: [main, master, dev] name: R-CMD-check diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 087f0b05..847176d3 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -4,7 +4,7 @@ on: push: branches: [main, master] pull_request: - branches: [main, master] + branches: [main, master, dev] release: types: [published] workflow_dispatch: diff --git a/DESCRIPTION b/DESCRIPTION index 6fecc73c..d3d100a9 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.6.0 +Version: 0.6.0.9999 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), @@ -62,7 +62,7 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.1 +RoxygenNote: 7.2.3 Depends: R (>= 2.10) URL: https://cmu-delphi.github.io/epiprocess/ diff --git a/NAMESPACE b/NAMESPACE index 065302d7..ced7195b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) +S3method(as_tibble,epi_df) S3method(as_tsibble,epi_df) S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) @@ -17,6 +18,7 @@ S3method(group_by,epi_archive) S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) S3method(group_by_drop_default,grouped_epi_archive) +S3method(group_modify,epi_df) S3method(groups,grouped_epi_archive) S3method(next_after,Date) S3method(next_after,integer) @@ -99,6 +101,7 @@ importFrom(rlang,sym) importFrom(rlang,syms) importFrom(stats,cor) importFrom(stats,median) +importFrom(tibble,as_tibble) importFrom(tidyr,unnest) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) diff --git a/NEWS.md b/NEWS.md index 4745f09a..aea09f44 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,30 @@ Note that `epiprocess` uses the [Semantic Versioning ("semver")](https://semver.org/) scheme for all release versions, but any inter-release development versions will include an additional ".9999" suffix. +## Breaking changes: + +* `epix_slide` has been made more like `dplyr::group_modify`. It will no longer + perform element/row recycling for size stability, accepts slide computation + outputs containing any number of rows, and no longer supports `all_rows`. + * To keep the old behavior, manually perform row recycling within `f` + computations, and/or `left_join` a data frame representing the desired + output structure with the current `epix_slide()` result to obtain the + desired repetitions and completions expected with `all_rows = TRUE`. +* `epix_slide` will only output grouped or ungrouped tibbles. Previously, it + would sometimes output `epi_df`s, but not consistently, and not always with + the metadata desired. Future versions will revisit this design, and consider + more closely whether/when/how to output an `epi_df`. + * To keep the old behavior, convert the output of `epix_slide()` to `epi_df` + when desired and set the metadata appropriately. + +## Improvements: + +* `epi_slide` and `epix_slide` now support `as_list_col = TRUE` when the slide + computations output atomic vectors, and output a list column in "chopped" + format (see `tidyr::chop`). +* `epi_slide` now works properly with slide computations that output just a + `Date` vector, rather than converting `slide_value` to a numeric column. + # epiprocess 0.6.0 ## Breaking changes: @@ -23,6 +47,13 @@ inter-release development versions will include an additional ".9999" suffix. * Slide functions now keep any grouping of `x` in their results, like `mutate` and `group_modify`. * To obtain the old behavior, `dplyr::ungroup` the slide results immediately. +* Additional `epi_slide` changes: + * When using `as_list_col = TRUE` together with `ref_time_values` and + `all_rows=TRUE`, the marker for excluded computations is now a `NULL` entry + in the list column, rather than a `NA`; if you are using `tidyr::unnest()` + afterward and want to keep these missing data markers, you will need to + replace the `NULL` entries with `NA`s. Skipped computations are now more + uniformly detectable using `vctrs` methods. * Additional`epix_slide` changes: * `epix_slide`'s `group_by` argument has been replaced by `dplyr::group_by` and `dplyr::ungroup` S3 methods. The `group_by` method uses "data masking" (also diff --git a/R/archive.R b/R/archive.R index d5be0ed4..5897fc4d 100644 --- a/R/archive.R +++ b/R/archive.R @@ -307,7 +307,7 @@ epi_archive = Abort("compactify must be boolean or null.") } - # Apply defaults and conduct checks and apply defaults for + # Apply defaults and conduct checks for # `clobberable_versions_start`, `versions_end`: if (missing(clobberable_versions_start)) { clobberable_versions_start <- NA @@ -640,7 +640,7 @@ epi_archive = slide = function(f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE) { + all_versions = FALSE) { # For an "ungrouped" slide, treat all rows as belonging to one big # group (group by 0 vars), like `dplyr::summarize`, and let the # resulting `grouped_epi_archive` handle the slide: @@ -649,7 +649,7 @@ epi_archive = before = before, ref_time_values = ref_time_values, time_step = time_step, new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, - all_rows = all_rows, all_versions = all_versions + all_versions = all_versions ) %>% # We want a slide on ungrouped archives to output something # ungrouped, rather than retaining the trivial (0-variable) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 76b079a4..b17bcd98 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -115,11 +115,11 @@ grouped_epi_archive = cat("Public `grouped_epi_archive` R6 methods:\n") grouped_method_names = names(grouped_epi_archive$public_methods) ungrouped_method_names = names(epi_archive$public_methods) - writeLines(wrap_varnames(initial = "• Specialized `epi_archive` methods: ", + writeLines(wrap_varnames(initial = "\u2022 Specialized `epi_archive` methods: ", intersect(grouped_method_names, ungrouped_method_names))) - writeLines(wrap_varnames(initial = "• Exclusive to `grouped_epi_archive`: ", + writeLines(wrap_varnames(initial = "\u2022 Exclusive to `grouped_epi_archive`: ", setdiff(grouped_method_names, ungrouped_method_names))) - writeLines(wrap_varnames(initial = "• `ungroup` to use: ", + writeLines(wrap_varnames(initial = "\u2022 `ungroup` to use: ", setdiff(ungrouped_method_names, grouped_method_names))) } # Return self invisibly for convenience in `$`-"pipe": @@ -190,7 +190,11 @@ grouped_epi_archive = slide = function(f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE) { + all_versions = FALSE) { + # Perform some deprecated argument checks without using ` = + # deprecated()` in the function signature, because they are from + # early development versions and much more likely to be clutter than + # informative in the signature. if ("group_by" %in% nse_dots_names(...)) { Abort(" The `group_by` argument to `slide` has been removed; please use @@ -200,12 +204,18 @@ grouped_epi_archive = this check is a false positive, but you will still need to use a different column name here and rename the resulting column after the slide.) - ") + ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") + } + if ("all_rows" %in% nse_dots_names(...)) { + Abort(" + The `all_rows` argument has been removed from `epix_slide` (but + is still supported in `epi_slide`). Add rows for excluded + results with a manual join instead. + ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") } if (missing(ref_time_values)) { - versions_with_updates = c(private$ungrouped$DT$version, private$ungrouped$versions_end) - ref_time_values = tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) + ref_time_values = epix_slide_ref_time_values_default(private$ungrouped) } else if (length(ref_time_values) == 0L) { Abort("`ref_time_values` must have at least one element.") } else if (any(is.na(ref_time_values))) { @@ -252,30 +262,14 @@ grouped_epi_archive = if (! (rlang::is_string(names_sep) || is.null(names_sep)) ) { Abort("`names_sep` must be a (single) string or NULL.") } - if (!rlang::is_bool(all_rows)) { - Abort("`all_rows` must be TRUE or FALSE.") - } if (!rlang::is_bool(all_versions)) { Abort("`all_versions` must be TRUE or FALSE.") } - # Each computation is expected to output a data frame with either - # one element/row total or one element/row per encountered - # nongrouping, nontime, nonversion key value. These nongrouping, - # nontime, nonversion key columns can be seen as the "effective" key - # of the computation; the computation might return an object that - # reports a different key or no key, but the "effective" key should - # still be a valid unique key for the data, and is something that we - # could use even with `.keep = FALSE`. - comp_effective_key_vars = - setdiff(key(private$ungrouped$DT), - c(private$vars, "time_value", "version")) - # Computation for one group, one time value comp_one_grp = function(.data_group, .group_key, f, ..., ref_time_value, - comp_effective_key_vars, new_col) { # Carry out the specified computation comp_value = f(.data_group, .group_key, ...) @@ -287,77 +281,18 @@ grouped_epi_archive = .data_group = .data_group$DT } - # Calculate the number of output elements/rows we expect the - # computation to output: one per distinct "effective computation - # key variable" value encountered in the input. - # - # Note: this mirrors how `epi_slide` does things if we're using - # unique keys, but can diverge if using nonunique keys. The - # `epi_slide` approach of counting occurrences of the - # `ref_time_value` in the `time_value` column, which helps lines - # up the computation results with corresponding rows of the - # input data, wouldn't quite apply here: we'd want to line up - # with rows (from the same group) with `version` matching the - # `ref_time_value`, but would still need to summarize these rows - # somehow and drop the `time_value` input column, but this - # summarization requires something like a to-be-unique output - # key to determine a sensible number of rows to output (and the - # contents of those rows). - count = - if (length(comp_effective_key_vars) != 0L) { - comp_effective_key_vals_in_comp_input = - if (data.table::is.data.table(.data_group)) { - .data_group[, comp_effective_key_vars, with=FALSE] - } else { - .data_group[, comp_effective_key_vars] - } - sum(!duplicated(comp_effective_key_vals_in_comp_input)) - } else { - # Same idea as above, but accounting for `duplicated` working - # differently (outputting `logical(0)`) on 0-column inputs - # rather than matching the number of rows. (Instead, we use - # the same count we would get if we were counting distinct - # values of a column defined as `rep(val, target_n_rows)`.) - if (nrow(.data_group) == 0L) { - 0L - } else { - 1L - } - } - - # If we get back an atomic vector - if (is.atomic(comp_value)) { - if (length(comp_value) == 1) { - comp_value = rep(comp_value, count) - } - # If not a singleton, should be the right length, else abort - else if (length(comp_value) != count) { - Abort('If the slide computation returns an atomic vector, then it must have either (a) a single element, or (b) one element per distinct combination of key variables, excluding the `time_value`, `version`, and grouping variables, that is present in the first argument to the computation.') - } - } - - # If we get back a data frame - else if (is.data.frame(comp_value)) { - if (nrow(comp_value) == 1) { - comp_value = rep(list(comp_value), count) - } - # If not a single row, should be the right length, else abort - else if (nrow(comp_value) != count) { - Abort("If the slide computation returns a data frame, then it must have a single row, or else one row per appearance of the reference time value in the local window.") - } - # Make into a list - else { - comp_value = split(comp_value, seq_len(nrow(comp_value))) - } - } - - # If neither an atomic vector data frame, then abort - else { + if (! (is.atomic(comp_value) || is.data.frame(comp_value))) { Abort("The slide computation must return an atomic vector or a data frame.") } - + # Wrap the computation output in a list and unchop/unnest later if + # `as_list_col = FALSE`. This approach means that we will get a + # list-class col rather than a data.frame-class col when + # `as_list_col = TRUE` and the computations outputs are data + # frames. + comp_value <- list(comp_value) + # Label every result row with the `ref_time_value`: - return(tibble::tibble(time_value = rep(.env$ref_time_value, count), + return(tibble::tibble(time_value = .env$ref_time_value, !!new_col := .env$comp_value)) } @@ -379,11 +314,12 @@ grouped_epi_archive = group_modify_fn = comp_one_grp } else { as_of_archive = as_of_raw - # We essentially want to `group_modify` the archive, but don't - # provide an implementation yet. Next best would be - # `group_modify` on its `$DT`, but that has different behavior - # based on whether or not `dtplyr` is loaded. Instead, go - # through a , trying to avoid copies. + # We essentially want to `group_modify` the archive, but + # haven't implemented this method yet. Next best would be + # `group_modify` on its `$DT`, but that has different + # behavior based on whether or not `dtplyr` is loaded. + # Instead, go through an ordinary data frame, trying to avoid + # copies. if (address(as_of_archive$DT) == address(private$ungrouped$DT)) { # `as_of` aliased its the full `$DT`; copy before mutating: as_of_archive$DT <- copy(as_of_archive$DT) @@ -391,12 +327,11 @@ grouped_epi_archive = dt_key = data.table::key(as_of_archive$DT) as_of_df = as_of_archive$DT data.table::setDF(as_of_df) - + # Convert each subgroup chunk to an archive before running the calculation. group_modify_fn = function(.data_group, .group_key, f, ..., ref_time_value, - comp_effective_key_vars, new_col) { # .data_group is coming from as_of_df as a tibble, but we # want to feed `comp_one_grp` an `epi_archive` backed by a @@ -407,19 +342,17 @@ grouped_epi_archive = .data_group_archive$DT = .data_group comp_one_grp(.data_group_archive, .group_key, f = f, ..., ref_time_value = ref_time_value, - comp_effective_key_vars = comp_effective_key_vars, new_col = new_col ) } } - + return( dplyr::group_by(as_of_df, dplyr::across(tidyselect::all_of(private$vars)), .drop=private$drop) %>% dplyr::group_modify(group_modify_fn, f = f, ..., ref_time_value = ref_time_value, - comp_effective_key_vars = comp_effective_key_vars, new_col = new_col, .keep = TRUE) ) @@ -459,7 +392,7 @@ grouped_epi_archive = # provide an implementation yet. Next best would be # `group_modify` on its `$DT`, but that has different behavior # based on whether or not `dtplyr` is loaded. Instead, go - # through a , trying to avoid copies. + # through an ordinary data frame, trying to avoid copies. if (address(as_of_archive$DT) == address(private$ungrouped$DT)) { # `as_of` aliased its the full `$DT`; copy before mutating: as_of_archive$DT <- copy(as_of_archive$DT) @@ -472,7 +405,6 @@ grouped_epi_archive = group_modify_fn = function(.data_group, .group_key, f, ..., ref_time_value, - comp_effective_key_vars, new_col) { # .data_group is coming from as_of_df as a tibble, but we # want to feed `comp_one_grp` an `epi_archive` backed by a @@ -483,7 +415,6 @@ grouped_epi_archive = .data_group_archive$DT = .data_group comp_one_grp(.data_group_archive, .group_key, f = f, quo = quo, ref_time_value = ref_time_value, - comp_effective_key_vars = comp_effective_key_vars, new_col = new_col ) } @@ -501,18 +432,31 @@ grouped_epi_archive = ) }) } - - # Unnest if we need to + + # Unchop/unnest if we need to if (!as_list_col) { x = tidyr::unnest(x, !!new_col, names_sep = names_sep) } - - # Join to get all rows, if we need to, then return - if (all_rows) { - cols = c(private$vars, "time_value") - y = unique(private$ungrouped$DT[, ..cols]) - x = dplyr::left_join(y, x, by = cols) - } + + # if (is_epi_df(x)) { + # # The analogue of `epi_df`'s `as_of` metadata for an archive is + # # `$versions_end`, at least in the current absence of + # # separate fields/columns denoting the "archive version" with a + # # different resolution, or from the perspective of a different + # # stage of a data pipeline. The `as_of` that is automatically + # # derived won't always match; override: + # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end + # } + + # XXX We need to work out when we want to return an `epi_df` and how + # to get appropriate keys (see #290, #223, #163). We'll probably + # need the commented-out code above if we ever output an `epi_df`. + # However, as a stopgap measure to have some more consistency across + # different ways of calling `epix_slide`, and to prevent `epi_df` + # output with invalid metadata, always output a (grouped or + # ungrouped) tibble. + x <- decay_epi_df(x) + return(x) } ) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index a9b84c0c..4db7e45d 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -478,16 +478,17 @@ epix_detailed_restricted_mutate = function(.data, ...) { } else { # Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing # and must-copy-on-write-if-refcount-more-than-1 model, obtaining a tibble, - # then `setDT`-ing it in place to be a `data.table`. The key should still be - # valid (assuming that the user did not explicitly alter `key(.data$DT)` or - # the columns by reference somehow within `...` tidyeval-style computations, - # or trigger refcount-1 alterations due to still having >1 refcounts on the - # columns), so in between, set the "sorted" attribute accordingly to prevent - # attempted sorting (including potential extra copies) or sortedness - # checking, then `setDT`. - out_DT = dplyr::dplyr_col_modify(in_tbl, col_modify_cols) # tibble - data.table::setattr(out_DT, "sorted", data.table::key(.data$DT)) - data.table::setDT(out_DT, key=key(.data$DT)) + # then convert it into a `data.table`. The key should still be valid + # (assuming that the user did not explicitly alter `key(.data$DT)` or the + # columns by reference somehow within `...` tidyeval-style computations, or + # trigger refcount-1 alterations due to still having >1 refcounts on the + # columns), set the "sorted" attribute accordingly to prevent attempted + # sorting (including potential extra copies) or sortedness checking, then + # `setDT` (rather than `as.data.table`, in order to prevent column copying + # to establish ownership according to `data.table`'s memory model). + out_DT = dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% + data.table::setattr("sorted", data.table::key(.data$DT)) %>% + data.table::setDT(key=key(.data$DT)) out_archive = .data$clone() out_archive$DT <- out_DT request_names = names(col_modify_cols) @@ -512,13 +513,13 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' #' @param .data An `epi_archive` or `grouped_epi_archive` #' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases); -#' * In `group_by`: unquoted variable name(s) or other ["data -#' masking"][dplyr::dplyr_data_masking] expression(s). It's possible to use -#' [`dplyr::mutate`]-like syntax here to calculate new columns on which to +#' * For `group_by`: unquoted variable name(s) or other +#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to +#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to #' perform grouping, but note that, if you are regrouping an already-grouped #' `.data` object, the calculations will be carried out ignoring such grouping #' (same as [in dplyr][dplyr::group_by]). -#' * In `ungroup`: either +#' * For `ungroup`: either #' * empty, in order to remove the grouping and output an `epi_archive`; or #' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select] #' expression(s), in order to remove the matching variables from the list of @@ -527,12 +528,13 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' the variable selection from `...` only; if `TRUE`, the output will be #' grouped by the current grouping variables plus the variable selection from #' `...`. -#' @param .drop As in [`dplyr::group_by`]; determines treatment of factor -#' columns. -#' @param x a `grouped_epi_archive`, or, in `is_grouped_epi_archive`, any object -#' @param .tbl An `epi_archive` or `grouped_epi_archive` (`epi_archive` -#' dispatches to the S3 default method, and `grouped_epi_archive` dispatches -#' its own S3 method) +#' @param .drop As described in [`dplyr::group_by`]; determines treatment of +#' factor columns. +#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for +#' `is_grouped_epi_archive`: any object +#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or +#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; +#' `grouped_epi_archive` dispatches its own S3 method) #' #' @details #' @@ -614,19 +616,10 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): #' toy_archive %>% group_by(geo_value) %>% groups() #' -#' # `.drop = FALSE` is supported in a sense; `f` is called on 0-row inputs for -#' # the missing groups identified by `dplyr`, but the row-recycling rules will -#' # exclude the corresponding outputs of `f` from the output of the slide: -#' all.equal( -#' toy_archive %>% -#' group_by(geo_value, age_group, .drop=FALSE) %>% -#' epix_slide(f = ~ sum(.x$value), before = 20) %>% -#' ungroup(), -#' toy_archive %>% -#' group_by(geo_value, age_group, .drop=TRUE) %>% -#' epix_slide(f = ~ sum(.x$value), before = 20) %>% -#' ungroup() -#' ) +#' toy_archive %>% +#' group_by(geo_value, age_group, .drop=FALSE) %>% +#' epix_slide(f = ~ sum(.x$value), before = 20) %>% +#' ungroup() #' #' @importFrom dplyr group_by #' @export @@ -635,6 +628,21 @@ epix_detailed_restricted_mutate = function(.data, ...) { group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_drop_default(.data)) { # `add` makes no difference; this is an ungrouped `epi_archive`. detailed_mutate = epix_detailed_restricted_mutate(.data, ...) + if (!rlang::is_bool(.drop)) { + Abort("`.drop` must be TRUE or FALSE") + } + if (!.drop) { + grouping_cols = as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] + grouping_col_is_factor = purrr::map_lgl(grouping_cols, is.factor) + # ^ Use `as.list` to try to avoid any possibility of a deep copy. + if (!any(grouping_col_is_factor)) { + Warn("`.drop=FALSE` but there are no factor grouping columns; did you mean to convert one of the columns to a factor beforehand?", + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors") + } else if (any(diff(grouping_col_is_factor) == -1L)) { + Warn("`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, or use `.drop=TRUE` and add a call to `tidyr::complete`.", + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor") + } + } grouped_epi_archive$new(detailed_mutate[["archive"]], detailed_mutate[["request_names"]], drop = .drop) @@ -699,23 +707,21 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' @param new_col_name String indicating the name of the new column that will #' contain the derivative values. Default is "slide_value"; note that setting #' `new_col_name` equal to an existing column name will overwrite this column. -#' @param as_list_col If the computations return data frames, should the slide -#' result hold these in a single list column or try to unnest them? Default is -#' `FALSE`, in which case a list object returned by `f` would be unnested -#' (using [`tidyr::unnest()`]), and the names of the resulting columns are given -#' by prepending `new_col_name` to the names of the list elements. +#' @param as_list_col Should the slide results be held in a list column, or be +#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, +#' in which case a list object returned by `f` would be unnested (using +#' [`tidyr::unnest()`]), and, if the slide computations output data frames, +#' the names of the resulting columns are given by prepending `new_col_name` +#' to the names of the list elements. #' @param names_sep String specifying the separator to use in `tidyr::unnest()` #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix #' from `new_col_name` entirely. -#' @param all_rows If `all_rows = TRUE`, then the output will have one row per -#' combination of grouping variables and unique time values in the underlying -#' data table. Otherwise, there will be one row in the output for each time -#' value in `x` that acts as a reference time value. Default is `FALSE`. -#' @param all_versions If `all_versions = TRUE`, then `f` will be passed the -#' version history (all `version <= ref_time_value`) for rows having -#' `time_value` between `ref_time_value - before` and `ref_time_value`. -#' Otherwise, `f` will be passed only the most recent `version` for every -#' unique `time_value`. Default is `FALSE`. +#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If +#' `all_versions = TRUE`, then `f` will be passed the version history (all +#' `version <= ref_time_value`) for rows having `time_value` between +#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be +#' passed only the most recent `version` for every unique `time_value`. +#' Default is `FALSE`. #' @return A tibble whose columns are: the grouping variables, `time_value`, #' containing the reference time values for the slide computation, and a #' column named according to the `new_col_name` argument, containing the slide @@ -736,32 +742,36 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' ends); `epi_slide` windows extend from `before` time steps before a #' `ref_time_value` through `after` time steps after `ref_time_value`. #' 3. The input class and columns are similar but different: `epix_slide` -#' keeps all columns and the `epi_df`-ness of the first input to the -#' computation; `epi_slide` only provides the grouping variables in the second -#' input, and will convert the first input into a regular tibble if the -#' grouping variables include the essential `geo_value` column. +#' (with the default `all_versions=FALSE`) keeps all columns and the +#' `epi_df`-ness of the first argument to each computation; `epi_slide` only +#' provides the grouping variables in the second input, and will convert the +#' first input into a regular tibble if the grouping variables include the +#' essential `geo_value` column. (With `all_versions=TRUE`, `epix_slide` will +#' will provide an `epi_archive` rather than an `epi-df` to each +#' computation.) #' 4. The output class and columns are similar but different: `epix_slide()` #' returns a tibble containing only the grouping variables, `time_value`, and -#' the new column(s) from the slide computation `f`, whereas `epi_slide()` +#' the new column(s) from the slide computations, whereas `epi_slide()` #' returns an `epi_df` with all original variables plus the new columns from -#' the slide computation. -#' 5. Unless grouping by `geo_value` and all `other_keys`, there will be -#' row-recyling behavior meant to resemble `epi_slide`'s results, based on the -#' distinct combinations of `geo_value`, `time_value`, and all `other_keys` -#' present in the version data with `time_value` matching one of the -#' `ref_time_values`. However, due to reporting latency or reporting dropping -#' in and out, this may not exactly match the behavior of "corresponding" -#' `epi_df`s. -#' 6. Similar to the row recyling, while `all_rows=TRUE` is designed to mimic -#' `epi_slide` by completing based on distinct combinations of `geo_value`, -#' `time_value`, and all `other_keys` present in the version data with -#' `time_value` matching one of the `ref_time_values`, this can have unexpected -#' behaviors due reporting latency or reporting dropping in and out. +#' the slide computations. (Both will mirror the grouping or ungroupedness of +#' their input, with one exception: `epi_archive`s can have trivial +#' (zero-variable) groupings, but these will be dropped in `epix_slide` +#' results as they are not supported by tibbles.) +#' 5. There are no size stability checks or element/row recycling to maintain +#' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is +#' roughly analogous to [`dplyr::group_modify`], while `epi_slide` is roughly +#' analogous to `dplyr::mutate` followed by `dplyr::arrange`) This is detailed +#' in the "advanced" vignette. +#' 6. `all_rows` is not supported in `epix_slide`; since the slide +#' computations are allowed more flexibility in their outputs than in +#' `epi_slide`, we can't guess a good representation for missing computations +#' for excluded group-`ref_time_value` pairs. #' 7. The `ref_time_values` default for `epix_slide` is based on making an #' evenly-spaced sequence out of the `version`s in the `DT` plus the #' `versions_end`, rather than the `time_value`s. -#' Apart from this, the interfaces between `epix_slide()` and `epi_slide()` are -#' the same. +#' +#' Apart from the above distinctions, the interfaces between `epix_slide()` and +#' `epi_slide()` are the same. #' #' Furthermore, the current function can be considerably slower than #' `epi_slide()`, for two reasons: (1) it must repeatedly fetch @@ -874,7 +884,7 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr epix_slide = function(x, f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE) { + all_versions = FALSE) { if (!is_epi_archive(x, grouped_okay=TRUE)) { Abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") } @@ -884,11 +894,19 @@ epix_slide = function(x, f, ..., before, ref_time_values, new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, - all_rows = all_rows, all_versions = all_versions )) } +#' Default value for `ref_time_values` in an `epix_slide` +#' +#' @noRd +epix_slide_ref_time_values_default = function(ea) { + versions_with_updates = c(ea$DT$version, ea$versions_end) + ref_time_values = tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) + return (ref_time_values) +} + #' Filter an `epi_archive` object to keep only older versions #' #' Generates a filtered `epi_archive` from an `epi_archive` object, keeping diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 503b8add..6429b867 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -1,3 +1,20 @@ +#' Convert to tibble +#' +#' Converts an `epi_df` object into a tibble, dropping metadata and any +#' grouping. +#' +#' @param x an `epi_df` +#' @param ... arguments to forward to `NextMethod()` +#' +#' @importFrom tibble as_tibble +#' @export +as_tibble.epi_df = function(x, ...) { + # Decaying drops the class and metadata. `as_tibble.grouped_df` drops the + # grouping and should be called by `NextMethod()` in the current design. + # See #223 for discussion of alternatives. + decay_epi_df(NextMethod()) +} + #' Convert to tsibble format #' #' Converts an `epi_df` object into a tsibble, where the index is taken to be @@ -187,9 +204,11 @@ ungroup.epi_df = function(x, ...) { reclass(x, metadata) } -#' @method unnest epi_df +#' @method group_modify epi_df #' @rdname print.epi_df -#' @param data The `epi_df` object. +#' @param .data The `epi_df` object. +#' @param .f function or formula; see [`dplyr::group_modify`] +#' @param .keep Boolean; see [`dplyr::group_modify`] #' @export group_modify.epi_df = function(.data, .f, ..., .keep = FALSE) { dplyr::dplyr_reconstruct(NextMethod(), .data) diff --git a/R/slide.R b/R/slide.R index d8d6becb..ab591f99 100644 --- a/R/slide.R +++ b/R/slide.R @@ -52,17 +52,25 @@ #' @param new_col_name String indicating the name of the new column that will #' contain the derivative values. Default is "slide_value"; note that setting #' `new_col_name` equal to an existing column name will overwrite this column. -#' @param as_list_col If the computations return data frames, should the slide -#' result hold these in a single list column or try to unnest them? Default is -#' `FALSE`, in which case a list object returned by `f` would be unnested -#' (using [`tidyr::unnest()`]), and the names of the resulting columns are given -#' by prepending `new_col_name` to the names of the list elements. +#' @param as_list_col Should the slide results be held in a list column, or be +#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, +#' in which case a list object returned by `f` would be unnested (using +#' [`tidyr::unnest()`]), and, if the slide computations output data frames, +#' the names of the resulting columns are given by prepending `new_col_name` +#' to the names of the list elements. #' @param names_sep String specifying the separator to use in `tidyr::unnest()` #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix #' from `new_col_name` entirely. #' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in -#' the output; otherwise, there will be one row for each time value in `x` -#' that acts as a reference time value. Default is `FALSE`. +#' the output even with `ref_time_values` provided, with some type of missing +#' value marker for the slide computation output column(s) for `time_value`s +#' outside `ref_time_values`; otherwise, there will be one row for each row in +#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The +#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type +#' of the slide computation output. If using `as_list_col = TRUE`, note that +#' the missing marker is a `NULL` entry in the list column; for certain +#' operations, you might want to replace these `NULL` entries with a different +#' `NA` marker. #' @return An `epi_df` object given by appending a new column to `x`, named #' according to the `new_col_name` argument. #' @@ -253,11 +261,11 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, time_values = time_values[o] # Compute the slide values - slide_values = slider::hop_index(.x = .data_group, - .i = .data_group$time_value, - .f = f, ..., - .starts = starts, - .stops = stops) + slide_values_list = slider::hop_index(.x = .data_group, + .i = .data_group$time_value, + .f = f, ..., + .starts = starts, + .stops = stops) # Now figure out which rows in the data group are in the reference time # values; this will be useful for all sorts of checks that follow @@ -270,49 +278,48 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, dplyr::count(.data$time_value) %>% dplyr::pull(n) - # If they're all atomic vectors - if (all(sapply(slide_values, is.atomic))) { - if (all(sapply(slide_values, length) == 1)) { - # Recycle to make size stable (one slide value per ref time value) - slide_values = rep(unlist(slide_values), times = counts) - } - else { - # Unlist, then check its length, and abort if not right - slide_values = unlist(slide_values) - if (length(slide_values) != num_ref_rows) { - Abort("If the slide computations return atomic vectors, then they must each have a single element, or else one element per appearance of the reference time value in the local window.") - } - } + if (!all(purrr::map_lgl(slide_values_list, is.atomic)) && + !all(purrr::map_lgl(slide_values_list, is.data.frame))) { + Abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).") } - - # If they're all data frames - else if (all(sapply(slide_values, is.data.frame))) { - if (all(sapply(slide_values, nrow) == 1)) { - # Recycle to make size stable (one slide value per ref time value) - slide_values = rep(slide_values, times = counts) + + # Unlist if appropriate: + slide_values = + if (as_list_col) { + slide_values_list + } else { + vctrs::list_unchop(slide_values_list) } - else { - # Split (each row on its own), check length, abort if not right - slide_df = dplyr::bind_rows(slide_values) - slide_values = split(slide_df, 1:nrow(slide_df)) - if (length(slide_values) != num_ref_rows) { - Abort("If the slide computations return data frames, then they must each have a single row, or else one row per appearance of the reference time value in the local window.") - } + + if (all(purrr::map_int(slide_values_list, vctrs::vec_size) == 1L) && + length(slide_values_list) != 0L) { + # Recycle to make size stable (one slide value per ref time value). + # (Length-0 case also could be handled here, but causes difficulties; + # leave it to the next branch, where it also belongs.) + slide_values = vctrs::vec_rep_each(slide_values, times = counts) + } else { + # Split and flatten if appropriate, perform a (loose) check on number of + # rows. + if (as_list_col) { + slide_values = purrr::list_flatten(purrr::map( + slide_values, ~ vctrs::vec_split(.x, seq_len(vctrs::vec_size(.x)))[["val"]] + )) + } + if (vctrs::vec_size(slide_values) != num_ref_rows) { + Abort("The slide computations must either (a) output a single element/row each, or (b) one element/row per appearance of the reference time value in the local window.") } } - - # If neither all atomic vectors or all data frames, then abort - else { - Abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).") - } - + # If all rows, then pad slide values with NAs, else filter down data group if (all_rows) { orig_values = slide_values - slide_values = rep(NA, nrow(.data_group)) - slide_values[o] = orig_values + slide_values = vctrs::vec_rep(vctrs::vec_cast(NA, orig_values), nrow(.data_group)) + # ^ using vctrs::vec_init would be shorter but docs don't guarantee it + # fills with NA equivalent. + vctrs::vec_slice(slide_values, o) = orig_values + } else { + .data_group = filter(.data_group, o) } - else .data_group = filter(.data_group, o) return(mutate(.data_group, !!new_col := slide_values)) } diff --git a/R/utils.R b/R/utils.R index d17f05d4..349c173a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -496,12 +496,12 @@ gcd_num = function(dividends, ..., rrtol=1e-6, pqlim=1e6, irtol=1e-6) { #' Use max valid period as guess for `period` of `ref_time_values` #' -#' @param `ref_time_values` Vector containing time-interval-like or time-like +#' @param ref_time_values Vector containing time-interval-like or time-like #' data, with at least two distinct values, [`diff`]-able (e.g., a #' `time_value` or `version` column), and should have a sensible result from #' adding `is.numeric` versions of its `diff` result (via `as.integer` if its #' `typeof` is `"integer"`, otherwise via `as.numeric`). -#' @param `ref_time_values_arg` Optional, string; name to give `ref_time_values` +#' @param ref_time_values_arg Optional, string; name to give `ref_time_values` #' in error messages. Defaults to quoting the expression the caller fed into #' the `ref_time_values` argument. #' @return `is.numeric`, length 1; attempts to match `typeof(ref_time_values)` diff --git a/_pkgdown.yml b/_pkgdown.yml index d9e2ec79..bba3ea8d 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -56,7 +56,7 @@ reference: desc: Details on `epi_archive`, and basic functionality. - contents: - matches("archive") -- title: `epix_*()` functions +- title: "`epix_*()` functions" desc: Functions that act on an `epi_archive` and/or `grouped_epi_archive` object. - contents: - starts_with("epix") diff --git a/man/as_tibble.epi_df.Rd b/man/as_tibble.epi_df.Rd new file mode 100644 index 00000000..c314f47e --- /dev/null +++ b/man/as_tibble.epi_df.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_df.R +\name{as_tibble.epi_df} +\alias{as_tibble.epi_df} +\title{Convert to tibble} +\usage{ +\method{as_tibble}{epi_df}(x, ...) +} +\arguments{ +\item{x}{an \code{epi_df}} + +\item{...}{arguments to forward to \code{NextMethod()}} +} +\description{ +Converts an \code{epi_df} object into a tibble, dropping metadata and any +grouping. +} diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 8ae412a6..a4a58645 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -322,7 +322,6 @@ details. new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE )}\if{html}{\out{}} } diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 1809e113..2a646670 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -71,19 +71,27 @@ return an object of class \code{lubridate::period}. For example, we can use contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} -\item{as_list_col}{If the computations return data frames, should the slide -result hold these in a single list column or try to unnest them? Default is -\code{FALSE}, in which case a list object returned by \code{f} would be unnested -(using \code{\link[tidyr:nest]{tidyr::unnest()}}), and the names of the resulting columns are given -by prepending \code{new_col_name} to the names of the list elements.} +\item{as_list_col}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:nest]{unnested}? Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:nest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix from \code{new_col_name} entirely.} \item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in -the output; otherwise, there will be one row for each time value in \code{x} -that acts as a reference time value. Default is \code{FALSE}.} +the output even with \code{ref_time_values} provided, with some type of missing +value marker for the slide computation output column(s) for \code{time_value}s +outside \code{ref_time_values}; otherwise, there will be one row for each row in +\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The +missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type +of the slide computation output. If using \code{as_list_col = TRUE}, note that +the missing marker is a \code{NULL} entry in the list column; for certain +operations, you might want to replace these \code{NULL} entries with a different +\code{NA} marker.} } \value{ An \code{epi_df} object given by appending a new column to \code{x}, named diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 39dbba82..e9b755b9 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -14,7 +14,6 @@ epix_slide( new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE ) } @@ -74,26 +73,23 @@ would only be meaningful if \code{time_value} is of class \code{POSIXct}).} contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} -\item{as_list_col}{If the computations return data frames, should the slide -result hold these in a single list column or try to unnest them? Default is -\code{FALSE}, in which case a list object returned by \code{f} would be unnested -(using \code{\link[tidyr:nest]{tidyr::unnest()}}), and the names of the resulting columns are given -by prepending \code{new_col_name} to the names of the list elements.} +\item{as_list_col}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:nest]{unnested}? Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:nest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix from \code{new_col_name} entirely.} -\item{all_rows}{If \code{all_rows = TRUE}, then the output will have one row per -combination of grouping variables and unique time values in the underlying -data table. Otherwise, there will be one row in the output for each time -value in \code{x} that acts as a reference time value. Default is \code{FALSE}.} - -\item{all_versions}{If \code{all_versions = TRUE}, then \code{f} will be passed the -version history (all \code{version <= ref_time_value}) for rows having -\code{time_value} between \code{ref_time_value - before} and \code{ref_time_value}. -Otherwise, \code{f} will be passed only the most recent \code{version} for every -unique \code{time_value}. Default is \code{FALSE}.} +\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If +\code{all_versions = TRUE}, then \code{f} will be passed the version history (all +\code{version <= ref_time_value}) for rows having \code{time_value} between +\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be +passed only the most recent \code{version} for every unique \code{time_value}. +Default is \code{FALSE}.} } \value{ A tibble whose columns are: the grouping variables, \code{time_value}, @@ -126,34 +122,38 @@ time interval (e.g., day) are only published after that time interval ends); \code{epi_slide} windows extend from \code{before} time steps before a \code{ref_time_value} through \code{after} time steps after \code{ref_time_value}. \item The input class and columns are similar but different: \code{epix_slide} -keeps all columns and the \code{epi_df}-ness of the first input to the -computation; \code{epi_slide} only provides the grouping variables in the second -input, and will convert the first input into a regular tibble if the -grouping variables include the essential \code{geo_value} column. +(with the default \code{all_versions=FALSE}) keeps all columns and the +\code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only +provides the grouping variables in the second input, and will convert the +first input into a regular tibble if the grouping variables include the +essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_slide} will +will provide an \code{epi_archive} rather than an \code{epi-df} to each +computation.) \item The output class and columns are similar but different: \code{epix_slide()} returns a tibble containing only the grouping variables, \code{time_value}, and -the new column(s) from the slide computation \code{f}, whereas \code{epi_slide()} +the new column(s) from the slide computations, whereas \code{epi_slide()} returns an \code{epi_df} with all original variables plus the new columns from -the slide computation. -\item Unless grouping by \code{geo_value} and all \code{other_keys}, there will be -row-recyling behavior meant to resemble \code{epi_slide}'s results, based on the -distinct combinations of \code{geo_value}, \code{time_value}, and all \code{other_keys} -present in the version data with \code{time_value} matching one of the -\code{ref_time_values}. However, due to reporting latency or reporting dropping -in and out, this may not exactly match the behavior of "corresponding" -\code{epi_df}s. -\item Similar to the row recyling, while \code{all_rows=TRUE} is designed to mimic -\code{epi_slide} by completing based on distinct combinations of \code{geo_value}, -\code{time_value}, and all \code{other_keys} present in the version data with -\code{time_value} matching one of the \code{ref_time_values}, this can have unexpected -behaviors due reporting latency or reporting dropping in and out. +the slide computations. (Both will mirror the grouping or ungroupedness of +their input, with one exception: \code{epi_archive}s can have trivial +(zero-variable) groupings, but these will be dropped in \code{epix_slide} +results as they are not supported by tibbles.) +\item There are no size stability checks or element/row recycling to maintain +size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is +roughly analogous to \code{\link[dplyr:group_map]{dplyr::group_modify}}, while \code{epi_slide} is roughly +analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed +in the "advanced" vignette. +\item \code{all_rows} is not supported in \code{epix_slide}; since the slide +computations are allowed more flexibility in their outputs than in +\code{epi_slide}, we can't guess a good representation for missing computations +for excluded group-\code{ref_time_value} pairs. \item The \code{ref_time_values} default for \code{epix_slide} is based on making an evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the \code{versions_end}, rather than the \code{time_value}s. -Apart from this, the interfaces between \code{epix_slide()} and \code{epi_slide()} are -the same. } +Apart from the above distinctions, the interfaces between \code{epix_slide()} and +\code{epi_slide()} are the same. + Furthermore, the current function can be considerably slower than \code{epi_slide()}, for two reasons: (1) it must repeatedly fetch properly-versioned snapshots from the data archive (via its \code{as_of()} diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index dac7ba0d..aee0a07b 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -27,12 +27,13 @@ is_grouped_epi_archive(x) \item{...}{Similar to \code{\link[dplyr:group_by]{dplyr::group_by}} (see "Details:" for edge cases); \itemize{ -\item In \code{group_by}: unquoted variable name(s) or other \link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to use -\code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to +\item For \code{group_by}: unquoted variable name(s) or other +\link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to +use \code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to perform grouping, but note that, if you are regrouping an already-grouped \code{.data} object, the calculations will be carried out ignoring such grouping (same as \link[dplyr:group_by]{in dplyr}). -\item In \code{ungroup}: either +\item For \code{ungroup}: either \itemize{ \item empty, in order to remove the grouping and output an \code{epi_archive}; or \item variable name(s) or other \link[dplyr:dplyr_tidy_select]{"tidy-select"} @@ -46,14 +47,15 @@ the variable selection from \code{...} only; if \code{TRUE}, the output will be grouped by the current grouping variables plus the variable selection from \code{...}.} -\item{.drop}{As in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of factor -columns.} +\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of +factor columns.} -\item{x}{a \code{grouped_epi_archive}, or, in \code{is_grouped_epi_archive}, any object} +\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for +\code{is_grouped_epi_archive}: any object} -\item{.tbl}{An \code{epi_archive} or \code{grouped_epi_archive} (\code{epi_archive} -dispatches to the S3 default method, and \code{grouped_epi_archive} dispatches -its own S3 method)} +\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or +\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method; +\code{grouped_epi_archive} dispatches its own S3 method)} } \description{ \code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive} @@ -137,18 +139,9 @@ toy_archive \%>\% group_by(geo_value, age_group) \%>\% ungroup(age_group) # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): toy_archive \%>\% group_by(geo_value) \%>\% groups() -# `.drop = FALSE` is supported in a sense; `f` is called on 0-row inputs for -# the missing groups identified by `dplyr`, but the row-recycling rules will -# exclude the corresponding outputs of `f` from the output of the slide: -all.equal( - toy_archive \%>\% - group_by(geo_value, age_group, .drop=FALSE) \%>\% - epix_slide(f = ~ sum(.x$value), before = 20) \%>\% - ungroup(), - toy_archive \%>\% - group_by(geo_value, age_group, .drop=TRUE) \%>\% - epix_slide(f = ~ sum(.x$value), before = 20) \%>\% - ungroup() -) +toy_archive \%>\% + group_by(geo_value, age_group, .drop=FALSE) \%>\% + epix_slide(f = ~ sum(.x$value), before = 20) \%>\% + ungroup() } diff --git a/man/guess_period.Rd b/man/guess_period.Rd index e27309d3..e03a1373 100644 --- a/man/guess_period.Rd +++ b/man/guess_period.Rd @@ -10,13 +10,13 @@ guess_period( ) } \arguments{ -\item{`ref_time_values`}{Vector containing time-interval-like or time-like +\item{ref_time_values}{Vector containing time-interval-like or time-like data, with at least two distinct values, \code{\link{diff}}-able (e.g., a \code{time_value} or \code{version} column), and should have a sensible result from adding \code{is.numeric} versions of its \code{diff} result (via \code{as.integer} if its \code{typeof} is \code{"integer"}, otherwise via \code{as.numeric}).} -\item{`ref_time_values_arg`}{Optional, string; name to give \code{ref_time_values} +\item{ref_time_values_arg}{Optional, string; name to give \code{ref_time_values} in error messages. Defaults to quoting the expression the caller fed into the \code{ref_time_values} argument.} } diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd index 878e7f18..f5749d82 100644 --- a/man/print.epi_df.Rd +++ b/man/print.epi_df.Rd @@ -17,7 +17,7 @@ \method{ungroup}{epi_df}(x, ...) -\method{unnest}{epi_df}(.data, .f, ..., .keep = FALSE) +\method{group_modify}{epi_df}(.data, .f, ..., .keep = FALSE) \method{unnest}{epi_df}(data, ...) } @@ -29,6 +29,12 @@ Currently unused.} \item{object}{The \code{epi_df} object.} +\item{.data}{The \code{epi_df} object.} + +\item{.f}{function or formula; see \code{\link[dplyr:group_map]{dplyr::group_modify}}} + +\item{.keep}{Boolean; see \code{\link[dplyr:group_map]{dplyr::group_modify}}} + \item{data}{The \code{epi_df} object.} } \description{ diff --git a/tests/testthat/test-deprecations.R b/tests/testthat/test-deprecations.R new file mode 100644 index 00000000..334b4488 --- /dev/null +++ b/tests/testthat/test-deprecations.R @@ -0,0 +1,48 @@ + +test_that("epix_slide group_by= deprecation works",{ + expect_error( + archive_cases_dv_subset %>% + epix_slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset %>% + group_by(geo_value) %>% + epix_slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + group_by(geo_value)$ + slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + # + expect_error( + archive_cases_dv_subset %>% + epix_slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset %>% + group_by(geo_value) %>% + epix_slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + group_by(geo_value)$ + slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) +}) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 84192f94..21191a0b 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -2,15 +2,23 @@ d <- as.Date("2020-01-01") -grouped = dplyr::bind_rows( +ungrouped = dplyr::bind_rows( dplyr::tibble(geo_value = "ak", time_value = d + 1:200, value=1:200), dplyr::tibble(geo_value = "al", time_value = d + 1:5, value=-(1:5)) ) %>% - as_epi_df() %>% + as_epi_df() +grouped = ungrouped %>% group_by(geo_value) - f = function(x, g) dplyr::tibble(value=mean(x$value), count=length(x$value)) +toy_edf = tibble::tribble( + ~geo_value, ~time_value, ~value , + "a" , 1:10 , 2L^( 1:10), + "b" , 1:10 , 2L^(11:20), + ) %>% + tidyr::unchop(c(time_value, value)) %>% + as_epi_df(as_of = 100) + ## --- These cases generate errors (or not): --- test_that("`before` and `after` are both vectors of length 1", { expect_error(epi_slide(grouped, f, before = c(0,1), after = 0, ref_time_values = d+3), @@ -87,6 +95,73 @@ test_that("these doesn't produce an error; the error appears only if the ref tim dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))) # not out of range for either group }) +test_that("computation output formats x as_list_col", { + # See `toy_edf` definition at top of file. + # We'll try 7d sum with a few formats. + basic_result_from_size1 = tibble::tribble( + ~geo_value, ~time_value, ~value , ~slide_value , + "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + ) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), + basic_result_from_size1 + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), as_list_col = TRUE), + basic_result_from_size1 %>% dplyr::mutate(slide_value = as.list(slide_value)) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), + basic_result_from_size1 %>% rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), + basic_result_from_size1 %>% + mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) + ) + # output naming functionality: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + new_col_name = "result"), + basic_result_from_size1 %>% rename(result_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value_sum = sum(.x$value)), + names_sep = NULL), + basic_result_from_size1 %>% rename(value_sum = slide_value) + ) + # trying with non-size-1 computation outputs: + basic_result_from_size2 = tibble::tribble( + ~geo_value, ~time_value, ~value , ~slide_value , + "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE) + 1L, + ) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1), + basic_result_from_size2 + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1, as_list_col = TRUE), + basic_result_from_size2 %>% dplyr::mutate(slide_value = as.list(slide_value)) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1)), + basic_result_from_size2 %>% rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1), as_list_col = TRUE), + basic_result_from_size2 %>% + mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) + ) +}) + test_that("epi_slide alerts if the provided f doesn't take enough args", { f_xg = function(x, g) dplyr::tibble(value=mean(x$value), count=length(x$value)) # If `regexp` is NA, asserts that there should be no errors/messages. @@ -97,3 +172,132 @@ test_that("epi_slide alerts if the provided f doesn't take enough args", { expect_warning(epi_slide(grouped, f_x_dots, before = 1L, ref_time_values = d+1), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") }) + +test_that("`ref_time_values` + `all_rows = TRUE` works", { + # See `toy_edf` definition at top of file. We'll do variants of a slide + # returning the following: + basic_full_result = tibble::tribble( + ~geo_value, ~time_value, ~value , ~slide_value , + "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + ) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) + # slide computations returning atomic vecs: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), + basic_full_result + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), + ref_time_values = c(2L, 8L)), + basic_full_result %>% dplyr::filter(time_value %in% c(2L, 8L)) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), + ref_time_values = c(2L, 8L), all_rows = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, NA_integer_)) + ) + # slide computations returning data frames: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), + basic_full_result %>% dplyr::rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L)), + basic_full_result %>% + dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, NA_integer_)) %>% + dplyr::rename(slide_value_value = slide_value) + ) + # slide computations returning data frames with `as_list_col=TRUE`: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + as_list_col = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), + as_list_col = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% + dplyr::filter(time_value %in% c(2L, 8L)) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, list(NULL))) + ) + # slide computations returning data frames, `as_list_col = TRUE`, `unnest`: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + as_list_col = TRUE) %>% + unnest(slide_value, names_sep = "_"), + basic_full_result %>% dplyr::rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), + as_list_col = TRUE) %>% + unnest(slide_value, names_sep = "_"), + basic_full_result %>% + dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE) %>% + unnest(slide_value, names_sep = "_"), + basic_full_result %>% + # XXX unclear exactly what we want in this case. Current approach is + # compatible with `vctrs::vec_detect_missing` but breaks `tidyr::unnest` + # compatibility + dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::rename(slide_value_value = slide_value) + ) + rework_nulls = function(slide_values_list) { + vctrs::vec_assign( + slide_values_list, + vctrs::vec_detect_missing(slide_values_list), + list(vctrs::vec_cast(NA, vctrs::vec_ptype_common(!!!slide_values_list))) + ) + } + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE) %>% + mutate(slide_value = rework_nulls(slide_value)) %>% + unnest(slide_value, names_sep = "_"), + basic_full_result %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, NA_integer_)) %>% + dplyr::rename(slide_value_value = slide_value) + ) +}) + +test_that("`epi_slide` doesn't decay date output", { + expect_true( + ungrouped %>% + epi_slide(before = 5L, ~ as.Date("2020-01-01")) %>% + `[[`("slide_value") %>% + inherits("Date") + ) +}) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 5eeb5c2c..34fef705 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -27,7 +27,6 @@ test_that("epix_slide works as intended",{ 2^6+2^3, 2^10+2^9, 2^15+2^14)) %>% - as_epi_df(as_of = 4) %>% # Also a bug (issue #213) group_by(geo_value) expect_identical(xx1,xx2) # * @@ -41,6 +40,114 @@ test_that("epix_slide works as intended",{ ) expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical + + # function interface + xx4 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = function(x, g) { + tibble::tibble(sum_binary = sum(x$binary)) + }, before = 2, names_sep = NULL) + + expect_identical(xx1,xx4) + + # tidyeval interface + xx5 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(sum_binary = sum(binary), + before = 2) + + expect_identical(xx1,xx5) +}) + +test_that("epix_slide works as intended with `as_list_col=TRUE`",{ + xx_dfrow1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE) + + xx_dfrow2 <- tibble( + geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = + c(2^3+2^2, + 2^6+2^3, + 2^10+2^9, + 2^15+2^14) %>% + purrr::map(~ data.frame(bin_sum = .x)) + ) %>% + group_by(geo_value) + + expect_identical(xx_dfrow1,xx_dfrow2) # * + + xx_dfrow3 <- ( + xx + $group_by(dplyr::across(dplyr::all_of("geo_value"))) + $slide(f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE) + ) + + expect_identical(xx_dfrow1,xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical + + xx_df1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ data.frame(bin = .x$binary), + before = 2, + as_list_col = TRUE) + + xx_df2 <- tibble( + geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = + list(c(2^3,2^2), + c(2^6,2^3), + c(2^10,2^9), + c(2^15,2^14)) %>% + purrr::map(~ data.frame(bin = rev(.x))) + ) %>% + group_by(geo_value) + + expect_identical(xx_df1,xx_df2) + + xx_scalar1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ sum(.x$binary), + before = 2, + as_list_col = TRUE) + + xx_scalar2 <- tibble( + geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = + list(2^3+2^2, + 2^6+2^3, + 2^10+2^9, + 2^15+2^14) + ) %>% + group_by(geo_value) + + expect_identical(xx_scalar1,xx_scalar2) + + xx_vec1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ .x$binary, + before = 2, + as_list_col = TRUE) + + xx_vec2 <- tibble( + geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = + list(c(2^3,2^2), + c(2^6,2^3), + c(2^10,2^9), + c(2^15,2^14)) %>% + purrr::map(rev) + ) %>% + group_by(geo_value) + + expect_identical(xx_vec1,xx_vec2) }) test_that("epix_slide `before` validation works", { @@ -180,16 +287,15 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss }) ea <- tibble::tribble(~version, ~time_value, ~binary, - 2, 1:1, 2^(1:1), - 3, 1:2, 2^(2:1), - 4, 1:3, 2^(3:1), - 5, 1:4, 2^(4:1), - 6, 1:5, 2^(5:1), - 7, 1:6, 2^(6:1)) %>% - tidyr::unnest(c(time_value,binary)) - -ea$geo_value <- "x" -ea <- as_epi_archive(ea) + 2, 1:1, 2^(1:1), + 3, 1:2, 2^(2:1), + 4, 1:3, 2^(3:1), + 5, 1:4, 2^(4:1), + 6, 1:5, 2^(5:1), + 7, 1:6, 2^(6:1)) %>% + tidyr::unnest(c(time_value,binary)) %>% + mutate(geo_value = "x") %>% + as_epi_archive() test_that("epix_slide with all_versions option has access to all older versions", { library(data.table) @@ -239,6 +345,28 @@ test_that("epix_slide with all_versions option has access to all older versions" expect_identical(result1,result3) # This and * Imply result2 and result3 are identical + # formula interface + result4 <- ea %>% group_by() %>% + epix_slide(f = ~ slide_fn(.x, .y), + before = 10^3, + names_sep = NULL, + all_versions = TRUE) + + expect_identical(result1,result4) # This and * Imply result2 and result4 are identical + + # tidyeval interface + result5 <- ea %>% + group_by() %>% + epix_slide(data = slide_fn( + .data$clone(), # hack to convert from pronoun back to archive + stop("slide_fn doesn't use group key, no need to prepare it") + ), + before = 10^3, + names_sep = NULL, + all_versions = TRUE) + + expect_identical(result1,result5) # This and * Imply result2 and result5 are identical + expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea }) @@ -305,7 +433,7 @@ test_that("as_of and epix_slide with long enough window are compatible", { ) }) -test_that("epix_slide `f` is passed an ungrouped `epi_archive`",{ +test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`",{ slide_fn <- function(x, g) { expect_true(is_epi_archive(x)) return(NA) @@ -349,6 +477,91 @@ test_that("epix_slide with all_versions option works as intended",{ expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical }) +# XXX currently, we're using a stopgap measure of having `epix_slide` always +# output a (grouped/ungrouped) tibble while we think about the class, columns, +# and attributes of `epix_slide` output more carefully. We might bring this test +# back depending on the decisions there: +# +# test_that("`epix_slide` uses `versions_end` as a resulting `epi_df`'s `as_of`", { +# ea_updated_stale = ea$clone() +# ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) +# # +# expect_identical( +# ea_updated_stale %>% +# group_by(geo_value) %>% +# epix_slide(~ slice_head(.x, n = 1L), before = 10L) %>% +# ungroup() %>% +# attr("metadata") %>% +# .$as_of, +# 10 +# ) +# }) + +test_that("epix_slide works with 0-row computation outputs", { + epix_slide_empty = function(ea, ...) { + ea %>% + epix_slide(before = 5L, ..., function(x, g) { + tibble::tibble() + }) + } + expect_identical( + ea %>% + epix_slide_empty(), + tibble::tibble( + time_value = ea$DT$version[integer(0)] + ) + ) + expect_identical( + ea %>% + group_by(geo_value) %>% + epix_slide_empty(), + tibble::tibble( + geo_value = ea$DT$geo_value[integer(0)], + time_value = ea$DT$version[integer(0)] + ) %>% + # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, + # as_of = ea$versions_end) %>% + group_by(geo_value) + ) + # with `all_versions=TRUE`, we have something similar but never get an + # `epi_df`: + expect_identical( + ea %>% + epix_slide_empty(all_versions=TRUE), + tibble::tibble( + time_value = ea$DT$version[integer(0)] + ) + ) + expect_identical( + ea %>% + group_by(geo_value) %>% + epix_slide_empty(all_versions=TRUE), + tibble::tibble( + geo_value = ea$DT$geo_value[integer(0)], + time_value = ea$DT$version[integer(0)] + ) %>% + group_by(geo_value) + ) +}) + +# test_that("epix_slide grouped by geo can produce `epi_df` output", { +# # This is a characterization test. Not sure we actually want this behavior; +# # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 +# expect_identical( +# ea %>% +# group_by(geo_value) %>% +# epix_slide(before = 5L, function(x,g) { +# tibble::tibble(value = 42) +# }, names_sep = NULL), +# tibble::tibble( +# geo_value = "x", +# time_value = epix_slide_ref_time_values_default(ea), +# value = 42 +# ) %>% +# new_epi_df(as_of = ea$versions_end) +# ) +# }) + test_that("epix_slide alerts if the provided f doesn't take enough args", { f_xg = function(x, g) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) # If `regexp` is NA, asserts that there should be no errors/messages. @@ -359,3 +572,15 @@ test_that("epix_slide alerts if the provided f doesn't take enough args", { expect_warning(epix_slide(xx, f_x_dots, before = 2L), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") }) + +test_that("`epix_slide` doesn't decay date output", { + expect_true( + xx$DT %>% + as_tibble() %>% + mutate(across(c(time_value, version), ~ as.Date("2000-01-01") + .x - 1L)) %>% + as_epi_archive() %>% + epix_slide(before = 5L, ~ attr(.x, "metadata")$as_of) %>% + `[[`("slide_value") %>% + inherits("Date") + ) +}) diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R new file mode 100644 index 00000000..0423352e --- /dev/null +++ b/tests/testthat/test-grouped_epi_archive.R @@ -0,0 +1,88 @@ +test_that("Grouping, regrouping, and ungrouping archives works as intended", { + # From an example: + library(dplyr) + toy_archive = + tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version)) %>% + as_epi_archive(other_keys = "age_group") + + # Ensure that we're using testthat edition 3's idea of "identical", which is + # not as strict as `identical`: + testthat::local_edition(3) + + # Test equivalency claims in example: + by_both_keys = toy_archive %>% group_by(geo_value, age_group) + expect_identical( + by_both_keys, + toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add=TRUE) + ) + grouping_cols = c("geo_value", "age_group") + expect_identical( + by_both_keys, + toy_archive %>% group_by(across(all_of(grouping_cols))) + ) + + expect_identical( + toy_archive %>% group_by(geo_value), + toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) + ) + + # Test `.drop` behavior: + expect_error(toy_archive %>% group_by(.drop = "bogus"), + regexp = "\\.drop.*TRUE or FALSE") + expect_warning(toy_archive %>% group_by(.drop=FALSE), + class="epiprocess__group_by_epi_archive__drop_FALSE_no_factors") + expect_warning(toy_archive %>% group_by(geo_value, .drop=FALSE), + class="epiprocess__group_by_epi_archive__drop_FALSE_no_factors") + expect_warning(grouped_factor_then_nonfactor <- + toy_archive %>% group_by(age_group, geo_value, .drop=FALSE), + class="epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor") + expect_identical(grouped_factor_then_nonfactor %>% + epix_slide(before = 10, s = sum(value)), + tibble::tribble( + ~age_group, ~geo_value, ~time_value, ~s, + "pediatric", NA_character_, "2000-01-02", 0, + "adult", "us", "2000-01-02", 121, + "pediatric", "us", "2000-01-03", 5, + "adult", "us", "2000-01-03", 255) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value)) %>% + # # See + # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 + # # and + # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 + # # for why this is commented out, pending some design + # # decisions. + # # + # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 + # as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(age_group, geo_value, time_value, s) %>% + group_by(age_group, geo_value, .drop=FALSE)) + expect_identical(toy_archive %>% + group_by(geo_value, age_group, .drop=FALSE) %>% + epix_slide(before = 10, s = sum(value)), + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~s, + "us", "pediatric", "2000-01-02", 0, + "us", "adult", "2000-01-02", 121, + "us", "pediatric", "2000-01-03", 5, + "us", "adult", "2000-01-03", 255) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value)) %>% + # as_epi_df(as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(geo_value, age_group, time_value, s) %>% + group_by(geo_value, age_group, .drop=FALSE) + ) +}) diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index de43d7c2..9d03cf93 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -116,3 +116,11 @@ test_that("Correct metadata when subset includes some of other_keys", { # Including both original other_keys was already tested above }) +test_that("Metadata and grouping are dropped by `as_tibble`", { + grouped_converted = toy_epi_df %>% + group_by(geo_value) %>% + as_tibble() + expect_true( + !any(c("metadata", "groups") %in% names(attributes(grouped_converted))) + ) +}) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index 9bf61fe4..02288905 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -7,21 +7,71 @@ vignette: > %\VignetteEncoding{UTF-8} --- -In this vignette, we discuss how to use the sliding functionality in the -`epiprocess` package with computations that have advanced output structures. - -In general, the functions `epi_slide()` and `epix_slide()` do what they can to -ensure the result of a slide operation is *size stable*, meaning, it will return -something whose length is the same as the number of appearances of reference -time values for the slide computation in the given data frame/table (this -defaults to all time values, but can be some given subset when `ref_time_values` -is specified). - -The output of a slide computation should either be an atomic value/vector, or a -data frame. This data frame can have multiple columns, multiple rows, or both. -Below we demonstrate some advanced use cases of sliding with these output -structures. We focus on `epi_slide()` for the most part, though the behavior we -demonstrate also carries over to `epix_slide()`. +In this vignette, we discuss how to use the sliding functionality in the +`epiprocess` package with less common grouping schemes or with computations that +have advanced output structures. +The output of a slide computation should either be an atomic value/vector, or a +data frame. This data frame can have multiple columns, multiple rows, or both. + +During basic usage (e.g., when all optional arguments are set to their defaults): + +* `epi_slide(edf, , .....)`: + + * keeps **all** columns of `edf`, adds computed column(s) + * outputs **one row per row in `edf`** (recycling outputs from + computations appropriately if there are multiple time series bundled + together inside any group(s)) + * maintains the grouping or ungroupedness of `edf` + * is roughly analogous to (the non-sliding) **`dplyr::mutate` followed by + `dplyr::arrange(time_value, .by_group = TRUE)`** + * outputs an **`epi_df`** if the required columns are present, otherwise a + tibble + +* `epix_slide(ea, , .....)`: + + * keeps **grouping and `time_value`** columns of `ea`, adds computed + column(s) + * outputs **any number of rows** (computations are allowed to output any + number of elements/rows, and no recycling is performed) + * maintains the grouping or ungroupedness of `ea`, unless it was explicitly + grouped by zero variables; this isn't supported by `grouped_df` and it will + automatically turn into an ungrouped tibble + * is roughly analogous to (the non-sliding) **`dplyr::group_modify`** + * outputs a **tibble** + +These differences in basic behavior make some common slide operations require less boilerplate: +* predictors and targets calculated with `epi_slide` are automatically lined up + with each other and with the signals from which they were calculated; and +* computations for an `epix_slide` can output data frames with any number of + rows, containing models, forecasts, evaluations, etc., and will not be + recycled. + +When using more advanced features, more complex rules apply: + +* Generalization: `epi_slide(edf, ....., ref_time_values=my_ref_time_values)` + will output one row for every row in `edf` with `time_value` appearing inside + `my_ref_time_values`, and is analogous to a `dplyr::mutate`&`dplyr::arrange` + followed by `dplyr::filter` to those `ref_time_values`. We call this property + **size stability**, and describe how it is achieved in the following sections. + The default behavior described above is a special case of this general rule + based on a default value of `ref_time_values`. +* Exception/feature: `epi_slide(edf, ....., ref_time_values=my_ref_time_values, + all_rows=TRUE)` will not just output rows for `my_ref_time_values`, but + instead will output one row per row in `edf`. +* Exception/feature: `epi_slide(edf, ....., as_list_col=TRUE)` will format the + output to add a single list-class computed column. +* Exception/feature: `epix_slide(ea, ....., as_list_col=TRUE)` will format the + output to have one row per computation and a single list-class computed column + (in addition to the grouping variables and `time_value`), as if we had used + `tidyr::chop()` or `tidyr::nest()`. +* Clarification: `ea %>% group_by(....., .drop=FALSE) %>% + epix_slide(, .....)` will call the computation on any missing + groups according to `dplyr`'s `.drop=FALSE` rules, resulting in additional + output rows. + +Below we demonstrate some advanced use cases of sliding with different output +structures. We focus on `epi_slide()` for the most part, though some of the +behavior we demonstrate also carries over to `epix_slide()`. ## Recycling outputs @@ -35,7 +85,7 @@ simple synthetic example. library(epiprocess) library(dplyr) -df <- tibble( +edf <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), @@ -44,29 +94,31 @@ df <- tibble( as_epi_df() # 2-day trailing average, per geo value -df %>% +edf %>% group_by(geo_value) %>% epi_slide(x_2dav = mean(x), before = 1) %>% ungroup() # 2-day trailing average, marginally -df %>% +edf %>% epi_slide(x_2dav = mean(x), before = 1) ``` ```{r, include = FALSE} # More checks (not included) -df %>% +edf %>% epi_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) -df %>% +edf %>% + # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% ungroup() -df %>% +edf %>% + # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% @@ -80,7 +132,7 @@ so, uses it to fill the new column. For example, this next computation gives the same result as the last one. ```{r} -df %>% +edf %>% epi_slide(y_2dav = rep(mean(x), 3), before = 1) ``` @@ -89,7 +141,7 @@ is *not* size stable, then `epi_slide()` throws an error. For example, below we are trying to return 2 things for 3 states. ```{r, error = TRUE} -df %>% +edf %>% epi_slide(x_2dav = rep(mean(x), 2), before = 1) ``` @@ -103,15 +155,15 @@ we set `as_list_col = TRUE` in the call to `epi_slide()`, the resulting `epi_df` object returned by `epi_slide()` has a list column containing the slide values. ```{r} -df2 <- df %>% +edf2 <- edf %>% group_by(geo_value) %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = TRUE) %>% ungroup() -class(df2$a) -length(df2$a) -df2$a[[2]] +class(edf2$a) +length(edf2$a) +edf2$a[[2]] ``` When we use `as_list_col = FALSE` (the default in `epi_slide()`), the function @@ -122,7 +174,7 @@ list column (here `a`) onto the column names of the output data frame from the slide computation (here `x_2dav` and `x_2dma`) separated by "_". ```{r} -df %>% +edf %>% group_by(geo_value) %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = FALSE) %>% @@ -133,7 +185,7 @@ We can use `names_sep = NULL` (which gets passed to `tidyr::unnest()`) to drop the prefix associated with list column name, in naming the unnested columns. ```{r} -df %>% +edf %>% group_by(geo_value) %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = FALSE, names_sep = NULL) %>% @@ -144,19 +196,19 @@ Furthermore, `epi_slide()` will recycle the single row data frame as needed in order to make the result size stable, just like the case for atomic values. ```{r} -df %>% +edf %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = FALSE, names_sep = NULL) ``` ```{r, include = FALSE} # More checks (not included) -df %>% +edf %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), ref_time_values = as.Date("2020-06-02"), before = 1, as_list_col = FALSE, names_sep = NULL) -df %>% +edf %>% mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% @@ -174,32 +226,51 @@ Meaning, `epi_slide()` will check that the result is size stable, and if so, will fill the new column(s) in the resulting `epi_df` object appropriately. This can be convenient for modeling in the following sense: we can, for example, -fit a sliding forecasting model by pooling data from different locations, and -then return separate forecasts from this common model for each location. We use -our synthetic example to demonstrate this idea abstractly but simply. +fit a sliding, data-versioning-unaware nowcasting or forecasting model by +pooling data from different locations, and then return separate forecasts from +this common model for each location. We use our synthetic example to demonstrate +this idea abstractly but simply by forecasting (actually, nowcasting) `y` from +`x` by fitting a time-windowed linear model that pooling data across all +locations. ```{r} -df$y <- 2 * df$x + 0.05 * rnorm(length(df$x)) +edf$y <- 2 * edf$x + 0.05 * rnorm(length(edf$x)) -df %>% +edf %>% epi_slide(function(d, ...) { obj <- lm(y ~ x, data = d) return( as.data.frame( predict(obj, newdata = d %>% - group_by(geo_value) %>% - filter(time_value == max(time_value)), + group_by(geo_value) %>% + filter(time_value == max(time_value)), interval = "prediction", level = 0.9) )) }, before = 1, new_col_name = "fc", names_sep = NULL) ``` +The above example focused on simplicity to show how to work with multi-row +outputs. Note however, the following issues in this example: + +* The `lm` fitting data includes the testing instances, as no training-test split was performed. +* Adding a simple training-test split would not factor in reporting latency properly. +* Data revisions are not taken into account. + +All three of these factors contribute to unrealistic retrospective forecasts and +overly optimistic retrospective performance evaluations. Instead, one should +favor an `epix_slide` for more realistic "pseudoprospective" forecasts. Using +`epix_slide` also makes it easier to express certain types of forecasts; while +in `epi_slide`, forecasts for additional aheads or quantile levels would need to +be expressed as additional columns, or nested inside list columns, `epix_slide` +does not perform size stability checks or recycling, allowing computations to +output any number of rows. + ## Version-aware forecasting, revisited -Finally, we revisit the COVID-19 forecasting example from the [archive +We revisit the COVID-19 forecasting example from the [archive vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) in order -to demonstrate the last point in a more realistic setting. First, we fetch the -versioned data and build the archive. +to demonstrate the preceding points regarding forecast evaluation in a more +realistic setting. First, we fetch the versioned data and build the archive. ```{r, message = FALSE, warning = FALSE, eval =FALSE} library(epidatr)