diff --git a/DESCRIPTION b/DESCRIPTION
index c7a651d8..b008956d 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: epiprocess
 Title: Tools for basic signal processing in epidemiology
-Version: 0.11.5
+Version: 0.11.6
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")),
diff --git a/NAMESPACE b/NAMESPACE
index e01468e1..a77fca82 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -32,6 +32,7 @@ S3method(epix_slide,epi_archive)
 S3method(epix_slide,grouped_epi_archive)
 S3method(epix_truncate_versions_after,epi_archive)
 S3method(epix_truncate_versions_after,grouped_epi_archive)
+S3method(filter,epi_archive)
 S3method(group_by,epi_archive)
 S3method(group_by,epi_df)
 S3method(group_by,grouped_epi_archive)
diff --git a/NEWS.md b/NEWS.md
index a971623f..58cd16cc 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -8,6 +8,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 
 - `is_epi_archive` function has been reintroduced.
 - `epix_as_of_current()` introduced as an alias for `epix_as_of(.$versions_end)`.
+- Added `dplyr::filter` implementation for `epi_archive`s.
 
 # epiprocess 0.11
 
diff --git a/R/archive.R b/R/archive.R
index 922371f1..88f8d704 100644
--- a/R/archive.R
+++ b/R/archive.R
@@ -52,13 +52,6 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE,
         class = "epiprocess__version_bound_mismatched_class"
       )
     }
-    if (!identical(typeof(version_bound), typeof(x[["version"]]))) {
-      cli_abort(
-        "{version_bound_arg} must have the same `typeof` as x$version,
-        which has a `typeof` of {typeof(x$version)}",
-        class = "epiprocess__version_bound_mismatched_typeof"
-      )
-    }
   }
 
   return(invisible(NULL))
@@ -207,23 +200,23 @@ next_after.Date <- function(x) x + 1L
 #' undergo tiny nonmeaningful revisions and the archive object with the
 #' default setting is too large.
 #' @param clobberable_versions_start Optional; `length`-1; either a value of the
-#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and
-#' `typeof`: specifically, either (a) the earliest version that could be
-#' subject to "clobbering" (being overwritten with different update data, but
-#' using the *same* version tag as the old update data), or (b) `NA`, to
-#' indicate that no versions are clobberable. There are a variety of reasons
-#' why versions could be clobberable under routine circumstances, such as (a)
-#' today's version of one/all of the columns being published after initially
-#' being filled with `NA` or LOCF, (b) a buggy version of today's data being
-#' published but then fixed and republished later in the day, or (c) data
-#' pipeline delays (e.g., publisher uploading, periodic scraping, database
-#' syncing, periodic fetching, etc.) that make events (a) or (b) reflected
-#' later in the day (or even on a different day) than expected; potential
-#' causes vary between different data pipelines. The default value is `NA`,
-#' which doesn't consider any versions to be clobberable. Another setting that
-#' may be appropriate for some pipelines is `max_version_with_row_in(x)`.
-#' @param versions_end Optional; length-1, same `class` and `typeof` as
-#' `x$version`: what is the last version we have observed? The default is
+#' same `class` as `x$version`, or an `NA` of any `class`: specifically,
+#' either (a) the earliest version that could be subject to "clobbering"
+#' (being overwritten with different update data, but using the *same* version
+#' tag as the old update data), or (b) `NA`, to indicate that no versions are
+#' clobberable. There are a variety of reasons why versions could be
+#' clobberable under routine circumstances, such as (a) today's version of
+#' one/all of the columns being published after initially being filled with
+#' `NA` or LOCF, (b) a buggy version of today's data being published but then
+#' fixed and republished later in the day, or (c) data pipeline delays (e.g.,
+#' publisher uploading, periodic scraping, database syncing, periodic
+#' fetching, etc.) that make events (a) or (b) reflected later in the day (or
+#' even on a different day) than expected; potential causes vary between
+#' different data pipelines. The default value is `NA`, which doesn't consider
+#' any versions to be clobberable. Another setting that may be appropriate for
+#' some pipelines is `max_version_with_row_in(x)`.
+#' @param versions_end Optional; length-1, same `class` as `x$version`: what is
+#' the last version we have observed? The default is
 #' `max_version_with_row_in(x)`, but values greater than this could also be
 #' valid, and would indicate that we observed additional versions of the data
 #' beyond `max(x$version)`, but they all contained empty updates. (The default
diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
index 8978bd62..faf3c128 100644
--- a/R/methods-epi_archive.R
+++ b/R/methods-epi_archive.R
@@ -80,19 +80,13 @@ epix_as_of <- function(x, version, min_time_value = -Inf, all_versions = FALSE,
       "`version` must have the same `class` vector as `epi_archive$DT$version`."
     )
   }
-  if (!identical(typeof(version), typeof(x$DT$version))) {
-    cli_abort(
-      "`version` must have the same `typeof` as `epi_archive$DT$version`."
-    )
-  }
   assert_scalar(version, na.ok = FALSE)
   if (version > x$versions_end) {
     cli_abort("`version` must be at most `epi_archive$versions_end`.")
   }
   assert_scalar(min_time_value, na.ok = FALSE)
   min_time_value_inf <- is.infinite(min_time_value) && min_time_value < 0
-  min_time_value_same_type <- typeof(min_time_value) == typeof(x$DT$time_value) &
-    class(min_time_value) == class(x$DT$time_value)
+  min_time_value_same_type <- identical(class(min_time_value), class(x$DT$time_value))
 
   if (!min_time_value_inf && !min_time_value_same_type) {
     cli_abort("`min_time_value` must be either -Inf or a time_value of the same type and class as `epi_archive$time_value`.")
@@ -941,9 +935,6 @@ epix_truncate_versions_after.epi_archive <- function(x, max_version) {
   if (!identical(class(max_version), class(x$DT$version))) {
     cli_abort("`max_version` must have the same `class` as `epi_archive$DT$version`.")
   }
-  if (!identical(typeof(max_version), typeof(x$DT$version))) {
-    cli_abort("`max_version` must have the same `typeof` as `epi_archive$DT$version`.")
-  }
   assert_scalar(max_version, na.ok = FALSE)
   if (max_version > x$versions_end) {
     cli_abort("`max_version` must be at most `epi_archive$versions_end`.")
@@ -1020,3 +1011,163 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
   attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols
   data
 }
+
+
+
+#' [`dplyr::filter`] for `epi_archive`s
+#'
+#' @param .data an `epi_archive`
+#' @param ... as in [`dplyr::filter`]; using the `version` column is not allowed
+#' unless you use `.format_aware = TRUE`; see details.
+#' @param .by as in [`dplyr::filter`]
+#' @param .format_aware optional, `TRUE` or `FALSE`; default `FALSE`. See
+#' details.
+#'
+#' @details
+#'
+#' By default, using the `version` column or measurement columns is disabled as
+#' it's easy to get unexpected results. See if either [`epix_as_of`] or
+#' [`epix_slide`] works for any version selection you have in mind: for version
+#' selection, see the `version` or `.versions` args, respectively; for
+#' measurement column-based filtering, try `filter`ing after `epix_as_of` or
+#' inside the `.f` in `epix_slide()`. If they don't cover your use case, then
+#' you can set `.format_aware = TRUE` to enable usage of these columns, but be
+#' careful to:
+#' * Factor in that `.data$DT` may have been converted into a compact format
+#'   based on diffing consecutive versions, and the last version of each
+#'   observation in `.data$DT` will always be carried forward to future
+#'   `version`s; see details of [`as_epi_archive`].
+#' * Set `clobberable_versions_start` and `versions_end` of the result
+#'   appropriately after the `filter` call. They will be initialized with the
+#'   same values as in `.data`.
+#'
+#' `dplyr::filter` also has an optional argument `.preserve`, which should not
+#' have an impact on (ungrouped) `epi_archive`s, and `grouped_epi_archive`s do
+#' not currently support `dplyr::filter`.
+#'
+#' @examples
+#'
+#' # Filter to one location and a particular time range:
+#' archive_cases_dv_subset %>%
+#'   filter(geo_value == "fl", time_value >= as.Date("2020-10-01"))
+#'
+#' # Convert to weekly by taking the Saturday data for each week, so that
+#' # `case_rate_7d_av` represents a Sun--Sat average:
+#' archive_cases_dv_subset %>%
+#'   filter(as.POSIXlt(time_value)$wday == 6L)
+#'
+#' # Filtering involving the `version` column or measurement columns requires
+#' # extra care. See epix_as_of and epix_slide instead for some common
+#' # operations. One semi-common operation that ends up being fairly simple is
+#' # treating observations as finalized after some amount of time, and ignoring
+#' # any revisions that were made after that point:
+#' archive_cases_dv_subset %>%
+#'   filter(
+#'     version <= time_value + as.difftime(60, units = "days"),
+#'     .format_aware = TRUE
+#'   )
+#'
+#' @export
+filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) {
+  in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal")
+  if (.format_aware) {
+    out_tbl <- in_tbl %>%
+      filter(..., .by = {{ .by }})
+  } else {
+    measurement_colnames <- setdiff(names(.data$DT), key_colnames(.data))
+    forbidden_colnames <- c("version", measurement_colnames)
+    out_tbl <- in_tbl %>%
+      filter(
+        # Add our own fake filter arg to the user's ..., to update the data mask
+        # to prevent `version` column usage.
+        {
+          # We should be evaluating inside the data mask. To disable both
+          # `version` and `.data$version` etc., we need to go to the ancestor
+          # environment containing the data mask's column bindings. This is
+          # likely just the parent env, but search to make sure, in a way akin
+          # to `<<-`:
+          e <- environment()
+          while (!identical(e, globalenv()) && !identical(e, emptyenv())) { # nolint:vector_logic_linter
+            if ("version" %in% names(e)) {
+              # This is where the column bindings are. Replace the forbidden ones.
+              # They are expected to be active bindings, so directly
+              # assigning has issues; `rm` first.
+              rm(list = forbidden_colnames, envir = e)
+              eval_env <- new.env(parent = asNamespace("epiprocess")) # see (2) below
+              delayedAssign(
+                "version",
+                cli_abort(c(
+                  "Using `version` in `filter.epi_archive` may produce unexpected results.",
+                  ">" = "See if `epix_as_of` or `epix_slide` would work instead.",
+                  ">" = "If not, see `?filter.epi_archive` details for how to proceed."
+                ), class = "epiprocess__filter_archive__used_version"),
+                eval.env = eval_env,
+                assign.env = e
+              )
+              for (measurement_colname in measurement_colnames) {
+                # Record current `measurement_colname` and set up execution for
+                # the promise for the error in its own dedicated environment, so
+                # that (1) `for` loop updating its value and `rm` cleanup don't
+                # mess things up. We can also (2) prevent changes to data mask
+                # ancestry (to involve user's quosure env rather than our
+                # quosure env) or contents (from edge case of user binding
+                # functions inside the mask) from potentially interfering by
+                # setting the promise's execution environment to skip over the
+                # data mask.
+                eval_env <- new.env(parent = asNamespace("epiprocess"))
+                eval_env[["local_measurement_colname"]] <- measurement_colname
+                delayedAssign(
+                  measurement_colname,
+                  cli_abort(c(
+                    "Using `{format_varname(local_measurement_colname)}`
+                     in `filter.epi_archive` may produce unexpected results.",
+                    ">" = "See `?filter.epi_archive` details for how to proceed."
+                  ), class = "epiprocess__filter_archive__used_measurement"),
+                  eval.env = eval_env,
+                  assign.env = e
+                )
+              }
+              break
+            }
+            e <- parent.env(e)
+          }
+          # Don't mask similarly-named user objects in ancestor envs:
+          rm(list = c("e", "measurement_colname", "eval_env"))
+          TRUE
+        },
+        ...,
+        .by = {{ .by }}
+      )
+  }
+  # We could try to re-infer the geo_type, e.g., when filtering from
+  # national+state to just state. However, we risk inference failures such as
+  # "hrr" -> "hhs" from filtering to hrr 10, or "custom" -> USA-related when
+  # working with non-USA data:
+  out_geo_type <- .data$geo_type
+  if (.data$time_type == "day") {
+    # We might be going from daily to weekly; re-infer:
+    out_time_type <- guess_time_type(out_tbl$time_value)
+  } else {
+    # We might be filtering weekly to a single time_value; avoid re-inferring to
+    # stay "week". Or in other cases, just skip inferring, as re-inferring is
+    # expected to match the input time_type:
+    out_time_type <- .data$time_type
+  }
+  # Even if they narrow down to just a single value of an other_keys column,
+  # it's probably still better (& simpler) to treat it as an other_keys column
+  # since it still exists in the result:
+  out_other_keys <- .data$other_keys
+  # `filter` makes no guarantees about not aliasing columns in its result when
+  # the filter condition is all TRUE, so don't setDT.
+  out_dtbl <- as.data.table(out_tbl, key = out_other_keys)
+  result <- new_epi_archive(
+    out_dtbl,
+    out_geo_type, out_time_type, out_other_keys,
+    # Assume version-related metadata unchanged; part of why we want to push
+    # back on filter expressions like `.data$version <= .env$as_of`:
+    .data$clobberable_versions_start, .data$versions_end
+  )
+  # Filtering down rows while keeping all (ukey) columns should preserve ukey
+  # uniqueness.
+  result
+}
diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R
index 7870dede..51f6cf33 100644
--- a/R/methods-epi_df.R
+++ b/R/methods-epi_df.R
@@ -532,7 +532,7 @@ sum_groups_epi_df <- function(.x, sum_cols, group_cols = "time_value") {
   if (!"geo_value" %in% group_cols) {
     out <- out %>%
       mutate(geo_value = "total") %>%
-      relocate(.data$geo_value, .before = 1)
+      relocate("geo_value", .before = 1)
   }
 
   # The `geo_type` will be correctly inherited here by the following logic:
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 2953039c..ebd73dc8 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -72,8 +72,9 @@ reference:
       - epix_as_of
       - epix_as_of_current
      - epix_slide
-      - epix_merge
       - revision_summary
+      - epix_merge
+      - filter.epi_archive
       - epix_fill_through_version
       - epix_truncate_versions_after
       - set_versions_end
diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd
index b92cd505..f91834f3 100644
--- a/man/epi_archive.Rd
+++ b/man/epi_archive.Rd
@@ -63,21 +63,21 @@ undergo tiny nonmeaningful revisions and the archive object with the
 default setting is too large.}
 
 \item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the
-same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and
-\code{typeof}: specifically, either (a) the earliest version that could be
-subject to "clobbering" (being overwritten with different update data, but
-using the \emph{same} version tag as the old update data), or (b) \code{NA}, to
-indicate that no versions are clobberable. There are a variety of reasons
-why versions could be clobberable under routine circumstances, such as (a)
-today's version of one/all of the columns being published after initially
-being filled with \code{NA} or LOCF, (b) a buggy version of today's data being
-published but then fixed and republished later in the day, or (c) data
-pipeline delays (e.g., publisher uploading, periodic scraping, database
-syncing, periodic fetching, etc.) that make events (a) or (b) reflected
-later in the day (or even on a different day) than expected; potential
-causes vary between different data pipelines. The default value is \code{NA},
-which doesn't consider any versions to be clobberable. Another setting that
-may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.}
+same \code{class} as \code{x$version}, or an \code{NA} of any \code{class}: specifically,
+either (a) the earliest version that could be subject to "clobbering"
+(being overwritten with different update data, but using the \emph{same} version
+tag as the old update data), or (b) \code{NA}, to indicate that no versions are
+clobberable. There are a variety of reasons why versions could be
+clobberable under routine circumstances, such as (a) today's version of
+one/all of the columns being published after initially being filled with
+\code{NA} or LOCF, (b) a buggy version of today's data being published but then
+fixed and republished later in the day, or (c) data pipeline delays (e.g.,
+publisher uploading, periodic scraping, database syncing, periodic
+fetching, etc.) that make events (a) or (b) reflected later in the day (or
+even on a different day) than expected; potential causes vary between
+different data pipelines. The default value is \code{NA}, which doesn't consider
+any versions to be clobberable. Another setting that may be appropriate for
+some pipelines is \code{max_version_with_row_in(x)}.}
 
 \item{.versions_end}{location based versions_end, used to avoid prefix
 \code{version = issue} from being assigned to \code{versions_end} instead of being
@@ -86,8 +86,8 @@ used to rename columns.}
 
 \item{...}{used for specifying column names, as in \code{\link[dplyr:rename]{dplyr::rename}}. For example \code{version = release_date}}
 
-\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as
-\code{x$version}: what is the last version we have observed? The default is
+\item{versions_end}{Optional; length-1, same \code{class} as \code{x$version}: what is
+the last version we have observed? The default is
 \code{max_version_with_row_in(x)}, but values greater than this could also be
 valid, and would indicate that we observed additional versions of the data
 beyond \code{max(x$version)}, but they all contained empty updates. (The default
diff --git a/man/filter.epi_archive.Rd b/man/filter.epi_archive.Rd
new file mode 100644
index 00000000..5f9d72db
--- /dev/null
+++ b/man/filter.epi_archive.Rd
@@ -0,0 +1,68 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-epi_archive.R
+\name{filter.epi_archive}
+\alias{filter.epi_archive}
+\title{\code{\link[dplyr:filter]{dplyr::filter}} for \code{epi_archive}s}
+\usage{
+\method{filter}{epi_archive}(.data, ..., .by = NULL, .format_aware = FALSE)
+}
+\arguments{
+\item{.data}{an \code{epi_archive}}
+
+\item{...}{as in \code{\link[dplyr:filter]{dplyr::filter}}; using the \code{version} column is not allowed
+unless you use \code{.format_aware = TRUE}; see details.}
+
+\item{.by}{as in \code{\link[dplyr:filter]{dplyr::filter}}}
+
+\item{.format_aware}{optional, \code{TRUE} or \code{FALSE}; default \code{FALSE}. See
+details.}
+}
+\description{
+\code{\link[dplyr:filter]{dplyr::filter}} for \code{epi_archive}s
+}
+\details{
+By default, using the \code{version} column or measurement columns is disabled as
+it's easy to get unexpected results. See if either \code{\link{epix_as_of}} or
+\code{\link{epix_slide}} works for any version selection you have in mind: for version
+selection, see the \code{version} or \code{.versions} args, respectively; for
+measurement column-based filtering, try \code{filter}ing after \code{epix_as_of} or
+inside the \code{.f} in \code{epix_slide()}. If they don't cover your use case, then
+you can set \code{.format_aware = TRUE} to enable usage of these columns, but be
+careful to:
+\itemize{
+\item Factor in that \code{.data$DT} may have been converted into a compact format
+based on diffing consecutive versions, and the last version of each
+observation in \code{.data$DT} will always be carried forward to future
+\code{version}s; see details of \code{\link{as_epi_archive}}.
+\item Set \code{clobberable_versions_start} and \code{versions_end} of the result
+appropriately after the \code{filter} call. They will be initialized with the
+same values as in \code{.data}.
+}
+
+\code{dplyr::filter} also has an optional argument \code{.preserve}, which should not
+have an impact on (ungrouped) \code{epi_archive}s, and \code{grouped_epi_archive}s do
+not currently support \code{dplyr::filter}.
+}
+\examples{
+
+# Filter to one location and a particular time range:
+archive_cases_dv_subset \%>\%
+  filter(geo_value == "fl", time_value >= as.Date("2020-10-01"))
+
+# Convert to weekly by taking the Saturday data for each week, so that
+# `case_rate_7d_av` represents a Sun--Sat average:
+archive_cases_dv_subset \%>\%
+  filter(as.POSIXlt(time_value)$wday == 6L)
+
+# Filtering involving the `version` column or measurement columns requires
+# extra care. See epix_as_of and epix_slide instead for some common
+# operations. One semi-common operation that ends up being fairly simple is
+# treating observations as finalized after some amount of time, and ignoring
+# any revisions that were made after that point:
+archive_cases_dv_subset \%>\%
+  filter(
+    version <= time_value + as.difftime(60, units = "days"),
+    .format_aware = TRUE
+  )
+
+}
diff --git a/man/revision_analysis.Rd b/man/revision_analysis.Rd
index 23ddf021..1c7336b3 100644
--- a/man/revision_analysis.Rd
+++ b/man/revision_analysis.Rd
@@ -56,7 +56,7 @@ of the \code{versions_end} are removed. \code{min_waiting_period} should charact
 the typical time during which most significant revisions occur. The default
 of 60 days corresponds to a typical near-final value for case counts as
 reported in the context of insurance. To avoid this filtering, either set
-to \code{NULL} or 0. This will be rounded up to the appropriate \code{time_type} if
+to \code{NULL} or 0. A \code{difftime} will be rounded up to the appropriate \code{time_type} if
 necessary (that is 5 days will be rounded to 1 week if the data is weekly).}
 
 \item{within_latest}{double between 0 and 1. Determines the threshold
diff --git a/tests/testthat/_snaps/methods-epi_archive.md b/tests/testthat/_snaps/methods-epi_archive.md
new file mode 100644
index 00000000..200e4202
--- /dev/null
+++ b/tests/testthat/_snaps/methods-epi_archive.md
@@ -0,0 +1,35 @@
+# filter.epi_archive works as expected
+
+    Code
+      ea2 %>% filter(version <= as.Date("2020-06-02"))
+    Condition
+      Error in `filter()`:
+      i In argument: `version <= as.Date("2020-06-02")`.
+      Caused by error:
+      ! Using `version` in `filter.epi_archive` may produce unexpected results.
+      > See if `epix_as_of` or `epix_slide` would work instead.
+      > If not, see `?filter.epi_archive` details for how to proceed.
+
+---
+
+    Code
+      ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L)
+    Condition
+      Error in `filter()`:
+      i In argument: `cases >= 2L`.
+      Caused by error:
+      ! Using `cases` in `filter.epi_archive` may produce unexpected results.
+      > See `?filter.epi_archive` details for how to proceed.
+
+---
+
+    Code
+      ea2p %>% filter(cases >= median(cases), .by = geo_value)
+    Condition
+      Error in `filter()`:
+      i In argument: `cases >= median(cases)`.
+      i In group 1: `geo_value = "ca"`.
+      Caused by error:
+      ! Using `cases` in `filter.epi_archive` may produce unexpected results.
+      > See `?filter.epi_archive` details for how to proceed.
+
diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R
index ee500d30..878cde1c 100644
--- a/tests/testthat/test-archive-version-bounds.R
+++ b/tests/testthat/test-archive-version-bounds.R
@@ -71,14 +71,21 @@ test_that("`validate_version_bound` validate and class checks together allow and
   # Bad:
   expect_error(validate_version_bound(3.5, x_int, TRUE, "vb"), regexp = "must have the same `class`")
   expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb"), regexp = "must have the same `class`")
-  expect_error(validate_version_bound(
-    `class<-`(list(2), "clazz"),
-    tibble::tibble(version = `class<-`(5L, "clazz")), TRUE, "vb"
-  ), regexp = "must have the same `typeof`", class = "epiprocess__version_bound_mismatched_typeof")
   # Maybe questionable:
   expect_error(validate_version_bound(3, x_int, TRUE, "vb"))
   expect_error(validate_version_bound(3L, x_dbl, TRUE, "vb"))
+  # Maybe questionable, but accept to relax things a bit, as this is happening
+  # with Dates in some R(?) versions. Might need to turn some things into
+  # vec_cast_common, but idea is just make Date stuff work for now:
+  validate_version_bound(
+    `class<-`(list(2), "clazz"),
+    tibble::tibble(version = `class<-`(5L, "clazz")), TRUE, "vb"
+  )
   # Good:
+  validate_version_bound(
+    `class<-`(2, "Date"),
+    tibble::tibble(version = `class<-`(5L, "Date")), TRUE, "vb"
+  )
   validate_version_bound(my_int, x_int, TRUE, "vb")
   validate_version_bound(my_dbl, x_dbl, TRUE, "vb")
   validate_version_bound(my_list, x_list, TRUE, "vb")
diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R
index 0aa4aca7..b9be125d 100644
--- a/tests/testthat/test-epi_slide.R
+++ b/tests/testthat/test-epi_slide.R
@@ -610,7 +610,7 @@ test_that("epi_slide_opt helper `full_date_seq` returns expected date values", {
   before <- 2L
   after <- 1L
 
-  expect_identical(
+  expect_equal(
     full_date_seq(
       epi_data_missing %>%
         mutate(time_value = days) %>%
@@ -627,7 +627,7 @@ test_that("epi_slide_opt helper `full_date_seq` returns expected date values", {
       pad_late_dates = as.Date(c("2022-01-08"))
     )
   )
-  expect_identical(
+  expect_equal(
     full_date_seq(
       epi_data_missing %>%
         mutate(time_value = weeks) %>%
@@ -677,7 +677,7 @@ test_that("epi_slide_opt helper `full_date_seq` returns expected date values", {
   before <- 5L
   after <- 0L
 
-  expect_identical(
+  expect_equal(
     full_date_seq(
       epi_data_missing %>%
         mutate(time_value = days) %>%
@@ -701,7 +701,7 @@ test_that("epi_slide_opt helper `full_date_seq` returns expected date values", {
   before <- 0L
   after <- 3L
 
-  expect_identical(
+  expect_equal(
     full_date_seq(
       epi_data_missing %>%
        mutate(time_value = days) %>%
diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R
index 26f0e769..c544368a 100644
--- a/tests/testthat/test-methods-epi_archive.R
+++ b/tests/testthat/test-methods-epi_archive.R
@@ -165,3 +165,142 @@ test_that("epix_as_of_now works as expected", {
     2027
   )
 })
+
+test_that("filter.epi_archive works as expected", {
+  ea2 <- ea2_data %>%
+    as_epi_archive()
+
+  # Some basic output value checks:
+
+  expect_equal(
+    ea2 %>% filter(geo_value == "tn"),
+    new_epi_archive(
+      ea2$DT[FALSE],
+      ea2$geo_type, ea2$time_type, ea2$other_keys,
+      ea2$clobberable_versions_start, ea2$versions_end
+    )
+  )
+
+  expect_equal(
+    ea2 %>% filter(geo_value == "ca", time_value == as.Date("2020-06-02")),
+    new_epi_archive(
+      data.table::data.table(
+        geo_value = "ca", time_value = as.Date("2020-06-02"),
+        version = as.Date("2020-06-02") + 0:2, cases = 0:2
+      ),
+      ea2$geo_type, ea2$time_type, ea2$other_keys,
+      ea2$clobberable_versions_start, ea2$versions_end
+    )
+  )
+
+  # Output geo_type and time_type behavior:
+
+  hrr_day_ea <- tibble(
+    geo_value = c(rep(1, 14), 100),
+    time_value = as.Date("2020-01-01") - 1 + c(1:14, 14),
+    version = time_value + 3,
+    value = 1:15
+  ) %>%
+    as_epi_archive()
+
+  expect_equal(hrr_day_ea$geo_type, "hrr")
+  expect_equal(hrr_day_ea$time_type, "day")
+
+  hrr_week_ea <- hrr_day_ea %>%
+    filter(geo_value == 1, as.POSIXlt(time_value)$wday == 6L)
+
+  expect_equal(hrr_week_ea$geo_type, "hrr")
+  expect_equal(hrr_week_ea$time_type, "week")
+
+  hrr_one_week_ea <- hrr_week_ea %>%
+    filter(time_value == time_value[[1]])
+
+  expect_equal(hrr_one_week_ea$time_type, "week")
+
+  intcustom_day_ea <- hrr_day_ea
+  intcustom_day_ea$geo_type <- "custom"
+
+  intcustom_week_ea <- intcustom_day_ea %>%
+    filter(geo_value == 1, as.POSIXlt(time_value)$wday == 6L)
+
+  expect_equal(intcustom_week_ea$geo_type, "custom")
+  expect_equal(intcustom_week_ea$time_type, "week")
+
+  # Environment variables should be fine:
+  version <- as.Date("2020-06-02") + 1
+  e <- version
+  expected <- ea2 %>% filter(geo_value == "ca", as.Date("2020-06-02") + 1 <= time_value)
+  expect_equal(ea2 %>% filter(geo_value == "ca", .env$version <= time_value), expected)
+  expect_equal(ea2 %>% filter(geo_value == "ca", e <= time_value), expected)
+  expect_equal(ea2 %>% filter(geo_value == "ca", .env$e <= time_value), expected)
+
+  # Error-raising:
+  expect_error(
+    ea2 %>% filter(version == as.Date("2020-06-02")),
+    class = "epiprocess__filter_archive__used_version"
+  )
+  expect_error(
+    ea2 %>% filter(version <= as.Date("2020-06-02")),
+    class = "epiprocess__filter_archive__used_version"
+  )
+  expect_snapshot(
+    ea2 %>% filter(version <= as.Date("2020-06-02")),
+    error = TRUE, cnd_class = TRUE
+  )
+  expect_error(
+    ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L),
+    class = "epiprocess__filter_archive__used_measurement"
+  )
+  expect_snapshot(
+    ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L),
+    error = TRUE, cnd_class = TRUE
+  )
+  expect_error(
+    ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L),
+    class = "epiprocess__filter_archive__used_measurement"
+  )
+  # Check for `for` + `delayedAssign` mishap in `expect_snapshot` (we should say
+  # something about `cases` (the relevant colname), not `deaths` (the last
+  # measurement colname)):
+  ea2p <- ea2_data %>%
+    mutate(deaths = 0) %>%
+    as_epi_archive()
+  expect_error(
+    ea2p %>% filter(cases >= median(cases), .by = geo_value),
+    class = "epiprocess__filter_archive__used_measurement"
+  )
+  expect_snapshot(
+    ea2p %>% filter(cases >= median(cases), .by = geo_value),
+    error = TRUE, cnd_class = TRUE
+  )
+  # Check that we are insulated from other lazy eval traps:
+  expected <- rlang::catch_cnd(ea2p %>% filter(cases >= median(cases), .by = geo_value))
+  expect_class(expected$parent, "epiprocess__filter_archive__used_measurement")
+  with(list(cli_abort = function(...) stop("now, pretend user didn't have cli attached")), {
+    expect_equal(
+      rlang::catch_cnd(ea2p %>% filter(cases >= median(cases), .by = geo_value))$parent$message,
+      expected$parent$message
+    )
+  })
+  expect_equal(
+    rlang::catch_cnd(ea2p %>% filter(
+      {
+        c <- function(...) stop("and that they overwrote `c` to try to debug their own code")
+        cases >= median(cases)
+      },
+      .by = geo_value
+    ))$parent$message,
+    expected$parent$message
+  )
+
+
+  # Escape hatch:
+  expect_equal(
+    ea2 %>%
+      filter(version <= time_value + as.difftime(1, units = "days"),
+        .format_aware = TRUE
+      ) %>%
+      .$DT,
+    ea2$DT[version <= time_value + as.difftime(1, units = "days"), ]
+  )
+})
diff --git a/tests/testthat/test-time-utils.R b/tests/testthat/test-time-utils.R
index 6fe8d78a..7ddd70c0 100644
--- a/tests/testthat/test-time-utils.R
+++ b/tests/testthat/test-time-utils.R
@@ -17,11 +17,11 @@ test_that("guess_period works", {
   # On Dates:
   daily_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "day")
   weekly_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "week")
-  expect_identical(
+  expect_equal(
    daily_dates[[1L]] + guess_period(daily_dates) * (seq_along(daily_dates) - 1L),
     daily_dates
   )
-  expect_identical(
+  expect_equal(
    weekly_dates[[1L]] + guess_period(weekly_dates) * (seq_along(weekly_dates) - 1L),
     weekly_dates
   )