Skip to content

Commit 732e268

Browse files
committed
feat: add filter.epi_archive
1 parent f809f0e commit 732e268

File tree

4 files changed

+183
-0
lines changed

4 files changed

+183
-0
lines changed

NAMESPACE

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ S3method(epix_slide,epi_archive)
3131
S3method(epix_slide,grouped_epi_archive)
3232
S3method(epix_truncate_versions_after,epi_archive)
3333
S3method(epix_truncate_versions_after,grouped_epi_archive)
34+
S3method(filter,epi_archive)
3435
S3method(group_by,epi_archive)
3536
S3method(group_by,epi_df)
3637
S3method(group_by,grouped_epi_archive)

NEWS.md

+6
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22

33
Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicate PR's.
44

5+
# epiprocess 0.12
6+
7+
## New features
8+
9+
- Added `dplyr::filter` implementation for `epi_archive`s.
10+
511
# epiprocess 0.11
612

713
## Breaking changes

R/methods-epi_archive.R

+115
Original file line numberDiff line numberDiff line change
@@ -984,3 +984,118 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
984984
attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols
985985
data
986986
}
987+
988+
989+
990+
#' [`dplyr::filter`] for `epi_archive`s
991+
#'
992+
#' @param .data an `epi_archive`
993+
#' @param ... as in [`dplyr::filter`]; using the `version` column is not allowed
994+
#' unless you use `.format_aware = TRUE`; see details.
995+
#' @param .by as in [`dplyr::filter`]
996+
#' @param .format_aware optional, `TRUE` or `FALSE`; default `FALSE`. See
997+
#' details.
998+
#'
999+
#' @details
1000+
#'
1001+
#' By default, using the `version` column is disabled as it's easy to
1002+
#' get unexpected results. See if either [`epix_as_of`] or [`epix_slide`]
1003+
#' works as an alternative. If they don't cover your use case, then you can
1004+
#' set `.format_aware = TRUE` to enable usage of the `version` column, but be
1005+
#' careful to:
1006+
#' * Factor in that `.data$DT` may be using a "compact" format based on diffing
1007+
#' consecutive versions; see details of [`as_epi_archive`]
1008+
#' * Set `clobberable_versions_start` and `versions_end` of the result
1009+
#' appropriately after the `filter` call. They will be initialized with the
1010+
#' same values as in `.data`.
1011+
#'
1012+
#' `dplyr::filter` also has an optional argument `.preserve`, which should not
1013+
#' have an impact on (ungrouped) `epi_archive`s, and `grouped_epi_archive`s do
1014+
#' not currently support `dplyr::filter`.
1015+
#'
1016+
#' @examples
1017+
#'
1018+
#' # Filter to one location and a particular time range:
1019+
#' archive_cases_dv_subset %>%
1020+
#' filter(geo_value == "fl", time_value >= as.Date("2020-10-01"))
1021+
#'
1022+
#' # Convert to weekly by taking the Saturday data for each week, so that
1023+
#' # `case_rate_7d_av` represents a Sun--Sat average:
1024+
#' archive_cases_dv_subset %>%
1025+
#' filter(as.POSIXlt(time_value)$wday == 6L)
1026+
#'
1027+
#' # Filtering involving versions requires extra care. See epix_as_of and
1028+
#' # epix_slide instead for some common operations. One semi-common operation
1029+
#' # that ends up being fairly simple is treating observations as finalized
1030+
#' # after some amount of time, and ignoring any revisions that were made after
1031+
#' # that point:
1032+
#' archive_cases_dv_subset %>%
1033+
#' filter(version <= time_value + as.difftime(60, units = "days"),
1034+
#' .format_aware = TRUE)
1035+
#'
1036+
#' @export
1037+
filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) {
1038+
in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal")
1039+
if (.format_aware) {
1040+
out_tbl <- in_tbl %>%
1041+
filter(..., .by = .by)
1042+
} else {
1043+
out_tbl <- in_tbl %>%
1044+
filter(
1045+
# Add our own fake filter arg to the user's ..., to update the data mask
1046+
# to prevent `version` column usage.
1047+
{
1048+
# We should be evaluating inside the data mask. To disable both
1049+
# `version` and `.data$version`, we need to go to the data mask's
1050+
# ------
1051+
e <- environment()
1052+
while (!identical(e, globalenv()) && !identical(e, emptyenv())) {
1053+
if ("version" %in% names(e)) {
1054+
# "version" is expected to be an active binding, and directly
1055+
# assigning over it has issues; explicitly `rm` first.
1056+
rm(list = "version", envir = e)
1057+
delayedAssign("version", cli::cli_abort(c(
1058+
"Using `version` in `filter` may produce unexpected results.",
1059+
">" = "See if `epix_as_of` or `epix_slide` would work instead.",
1060+
">" = "If not, see `?filter.epi_archive` details for how to proceed."
1061+
)), assign.env = e)
1062+
break
1063+
}
1064+
e <- parent.env(e)
1065+
}
1066+
TRUE
1067+
},
1068+
...,
1069+
.by = .by
1070+
)
1071+
}
1072+
out_geo_type <-
1073+
if (.data$geo_type == "custom") {
1074+
# We might be going from a multi-resolution to single-resolution archive;
1075+
# e.g. national+state -> state; try to re-infer:
1076+
guess_geo_type(out_tbl$geo_value)
1077+
} else {
1078+
# We risk less-understandable inference failures such as inferring "hhs"
1079+
# from selecting hrr 10 data; just use the old geo_type:
1080+
.data$geo_type
1081+
}
1082+
# We might be going from daily to weekly; re-infer:
1083+
out_time_type <- guess_time_type(out_tbl$time_value)
1084+
# Even if they narrow down to just a single value of an other_keys column,
1085+
# it's probably still better (& simpler) to treat it as an other_keys column
1086+
# since it still exists in the result:
1087+
out_other_keys <- .data$other_keys
1088+
# `filter` makes no guarantees about not aliasing columns in its result when
1089+
# the filter condition is all TRUE, so don't setDT.
1090+
out_dtbl <- as.data.table(out_tbl, key = out_other_keys)
1091+
result <- new_epi_archive(
1092+
out_dtbl,
1093+
out_geo_type, out_time_type, out_other_keys,
1094+
# Assume version-related metadata unchanged; part of why we want to push
1095+
# back on filter expressions like `.data$version <= .env$as_of`:
1096+
.data$clobberable_versions_start, .data$versions_end
1097+
)
1098+
# Filtering down rows while keeping all (ukey) columns should preserve ukey
1099+
# uniqueness.
1100+
result
1101+
}

man/filter.epi_archive.Rd

+61
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)