Skip to content

Add filter.epi_archive #651

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Apr 16, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ S3method(epix_slide,epi_archive)
S3method(epix_slide,grouped_epi_archive)
S3method(epix_truncate_versions_after,epi_archive)
S3method(epix_truncate_versions_after,grouped_epi_archive)
S3method(filter,epi_archive)
S3method(group_by,epi_archive)
S3method(group_by,epi_df)
S3method(group_by,grouped_epi_archive)
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicate PR's.

# epiprocess 0.12

## New features

- Added `dplyr::filter` implementation for `epi_archive`s.

# epiprocess 0.11

## Breaking changes
Expand Down
133 changes: 133 additions & 0 deletions R/methods-epi_archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -984,3 +984,136 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols
data
}



#' [`dplyr::filter`] for `epi_archive`s
#'
#' @param .data an `epi_archive`
#' @param ... as in [`dplyr::filter`]; using the `version` column is not allowed
#' unless you use `.format_aware = TRUE`; see details.
#' @param .by as in [`dplyr::filter`]
#' @param .format_aware optional, `TRUE` or `FALSE`; default `FALSE`. See
#' details.
#'
#' @details
#'
#' By default, using the `version` column or measurement columns is disabled as
#' it's easy to get unexpected results. See if either [`epix_as_of`] or
#' [`epix_slide`] works as an alternative. If they don't cover your use case,
#' then you can set `.format_aware = TRUE` to enable usage of these columns, but
#' be careful to:
#' * Factor in that `.data$DT` may have been converted into a compact format
#' based on diffing consecutive versions, and the last version of each
#' observation in `.data$DT` will always be carried forward to future
#' `version`s`; see details of [`as_epi_archive`].
#' * Set `clobberable_versions_start` and `versions_end` of the result
#' appropriately after the `filter` call. They will be initialized with the
#' same values as in `.data`.
#'
#' `dplyr::filter` also has an optional argument `.preserve`, which should not
#' have an impact on (ungrouped) `epi_archive`s, and `grouped_epi_archive`s do
#' not currently support `dplyr::filter`.
#'
#' @examples
#'
#' # Filter to one location and a particular time range:
#' archive_cases_dv_subset %>%
#' filter(geo_value == "fl", time_value >= as.Date("2020-10-01"))
#'
#' # Convert to weekly by taking the Saturday data for each week, so that
#' # `case_rate_7d_av` represents a Sun--Sat average:
#' archive_cases_dv_subset %>%
#' filter(as.POSIXlt(time_value)$wday == 6L)
#'
#' # Filtering involving the `version` column or measurement columns requires
#' # extra care. See epix_as_of and epix_slide instead for some common
#' # operations. One semi-common operation that ends up being fairly simple is
#' # treating observations as finalized after some amount of time, and ignoring
#' # any revisions that were made after that point:
#' archive_cases_dv_subset %>%
#' filter(
#' version <= time_value + as.difftime(60, units = "days"),
#' .format_aware = TRUE
#' )
#'
#' @export
filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) {
in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal")
if (.format_aware) {
out_tbl <- in_tbl %>%
filter(..., .by = {{ .by }})
} else {
measurement_colnames <- setdiff(names(.data$DT), key_colnames(.data))
forbidden_colnames <- c("version", measurement_colnames)
out_tbl <- in_tbl %>%
filter(
# Add our own fake filter arg to the user's ..., to update the data mask
# to prevent `version` column usage.
{
# We should be evaluating inside the data mask. To disable both
# `version` and `.data$version` etc., we need to go to the ancestor
# environment containing the data mask's column bindings. This is
# likely just the parent env, but search to make sure, in a way akin
# to `<<-`:
e <- environment()
while (!identical(e, globalenv()) && !identical(e, emptyenv())) {
if ("version" %in% names(e)) {
# This is where the column bindings are. Replace the forbidden ones.
# They are expected to be active bindings, so directly
# assigning has issues; `rm` first.
rm(list = forbidden_colnames, envir = e)
delayedAssign("version", cli::cli_abort(c(
"Using `version` in `filter.epi_archive` may produce unexpected results.",
">" = "See if `epix_as_of` or `epix_slide` would work instead.",
">" = "If not, see `?filter.epi_archive` details for how to proceed."
), class = "epiprocess__filter_archive__used_version"), assign.env = e)
for (measurement_colname in measurement_colnames) {
delayedAssign(measurement_colname, cli::cli_abort(c(
"Using `{format_varname(measurement_colname)}`
in `filter.epi_archive` may produce unexpected results.",
">" = "See `?filter.epi_archive` details for how to proceed."
), class = "epiprocess__filter_archive__used_measurement"), assign.env = e)
}
break
}
e <- parent.env(e)
}
TRUE
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like it will error if you use it inside a function that has "version" or "time_value" defined in its environment? I'm reading this as traversing up the environment chain and stopping short of the globalenv(), which would be most likely to have variables like that defined, but intermediate scopes might still have false positives.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah ok, I tested it locally and it seems fine. I guess this works because we hit the data mask environment first and break out before we hit the user's function environment? Seems reasonable.

Copy link
Contributor Author

@brookslogan brookslogan Apr 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, the data mask environment chain in current dplyr&rlang looks something like
rlang wrapper env -> data bindings env (/ env chain input to as_data_mask env) -> quosure env (chain)
where:

  • rlang wrapper env holds the real data pronoun objects, a ~ override I don't quite understand, and some other internals
  • data bindings env is typically just a single env holding (group's) column bindings; in other contexts, it could be an env chain we fed into as_data_mask (with its "top" ancestor reassigned to point at:)
  • the quosure env

We should stop at the data bindings env and reassign things there.

But I did find an issue along those lines

epidatasets::case_death_rate_archive %>% {.$DT <- copy(.$DT)[, e := 1]; .} %>% filter(e < 2)
#> Error in `filter()`:
#> ℹ In argument: `e < 2`.
#> Caused by error in `e < 2`:
#> ! comparison (<) is possible only for atomic and list types
#> Run `rlang::last_trace()` to see where the error occurred.

because I'm leaving around an e in the rlang wrapper env.

Also, I fell for a classic lazy eval + env issue

epidatasets::case_death_rate_archive %>% filter(case_rate_7d_av < 2)
#> Error in `filter()`:
#> ℹ In argument: `case_rate_7d_av < 2`.
#> Caused by error:
#> ! Using `death_rate_7d_av` in `filter.epi_archive` may produce unexpected results.
#> → See `?filter.epi_archive` details for how to proceed.
#> Run `rlang::last_trace()` to see where the error occurred.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

... and found some others. Should be fixed now.

},
...,
.by = {{ .by }}
)
}
# We could try to re-infer the geo_type, e.g., when filtering from
# national+state to just state. However, we risk inference failures such as
# "hrr" -> "hhs" from filtering to hrr 10, or "custom" -> USA-related when
# working with non-USA data:
out_geo_type <- .data$geo_type
if (.data$time_type == "day") {
# We might be going from daily to weekly; re-infer:
out_time_type <- guess_time_type(out_tbl$time_value)
} else {
# We might be filtering weekly to a single time_value; avoid re-inferring to
# stay "week". Or in other cases, just skip inferring, as re-inferring is
# expected to match the input time_type:
out_time_type <- .data$time_type
}
# Even if they narrow down to just a single value of an other_keys column,
# it's probably still better (& simpler) to treat it as an other_keys column
# since it still exists in the result:
out_other_keys <- .data$other_keys
# `filter` makes no guarantees about not aliasing columns in its result when
# the filter condition is all TRUE, so don't setDT.
out_dtbl <- as.data.table(out_tbl, key = out_other_keys)
result <- new_epi_archive(
out_dtbl,
out_geo_type, out_time_type, out_other_keys,
# Assume version-related metadata unchanged; part of why we want to push
# back on filter expressions like `.data$version <= .env$as_of`:
.data$clobberable_versions_start, .data$versions_end
)
# Filtering down rows while keeping all (ukey) columns should preserve ukey
# uniqueness.
result
}
65 changes: 65 additions & 0 deletions man/filter.epi_archive.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

105 changes: 105 additions & 0 deletions tests/testthat/test-methods-epi_archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,108 @@ test_that("group_vars works as expected", {
"geo_value"
)
})

test_that("filter.epi_archive works as expected", {
ea2 <- ea2_data %>%
as_epi_archive()

# Some basic output value checks:

expect_equal(
ea2 %>% filter(geo_value == "tn"),
new_epi_archive(
ea2$DT[FALSE],
ea2$geo_type, ea2$time_type, ea2$other_keys,
ea2$clobberable_versions_start, ea2$versions_end
)
)

expect_equal(
ea2 %>% filter(geo_value == "ca", time_value == as.Date("2020-06-02")),
new_epi_archive(
data.table::data.table(
geo_value = "ca", time_value = as.Date("2020-06-02"),
version = as.Date("2020-06-02") + 0:2, cases = 0:2
),
ea2$geo_type, ea2$time_type, ea2$other_keys,
ea2$clobberable_versions_start, ea2$versions_end
)
)

# Output geo_type and time_type behavior:

hrr_day_ea <- tibble(
geo_value = c(rep(1, 14), 100),
time_value = as.Date("2020-01-01") - 1 + c(1:14, 14),
version = time_value + 3,
value = 1:15
) %>%
as_epi_archive()

expect_equal(hrr_day_ea$geo_type, "hrr")
expect_equal(hrr_day_ea$time_type, "day")

hrr_week_ea <- hrr_day_ea %>%
filter(geo_value == 1, as.POSIXlt(time_value)$wday == 6L)

expect_equal(hrr_week_ea$geo_type, "hrr")
expect_equal(hrr_week_ea$time_type, "week")

hrr_one_week_ea <- hrr_week_ea %>%
filter(time_value == time_value[[1]])

expect_equal(hrr_one_week_ea$time_type, "week")

intcustom_day_ea <- hrr_day_ea
intcustom_day_ea$geo_type <- "custom"

intcustom_week_ea <- intcustom_day_ea %>%
filter(geo_value == 1, as.POSIXlt(time_value)$wday == 6L)

expect_equal(intcustom_week_ea$geo_type, "custom")
expect_equal(intcustom_week_ea$time_type, "week")

# Environment variables should be fine:
version <- as.Date("2020-06-02") + 1
expect_no_error(ea2 %>% filter(geo_value == "ca", .env$version <= time_value))

# Error-raising:
expect_error(
ea2 %>% filter(version == as.Date("2020-06-02")),
class = "epiprocess__filter_archive__used_version"
)
expect_error(
ea2 %>% filter(version <= as.Date("2020-06-02")),
class = "epiprocess__filter_archive__used_version"
)
expect_snapshot(
ea2 %>% filter(version <= as.Date("2020-06-02")),
error = TRUE, cnd_class = TRUE
)
expect_error(
ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L),
class = "epiprocess__filter_archive__used_measurement"
)
expect_snapshot(
ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L),
error = TRUE, cnd_class = TRUE
)
expect_error(
ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L),
class = "epiprocess__filter_archive__used_measurement"
)
expect_error(
ea2 %>% filter(cases >= median(cases), .by = geo_value),
class = "epiprocess__filter_archive__used_measurement"
)

# Escape hatch:
expect_equal(
ea2 %>%
filter(version <= time_value + as.difftime(1, units = "days"),
.format_aware = TRUE
) %>%
.$DT,
ea2$DT[version <= time_value + as.difftime(1, units = "days"), ]
)
})