cmu-delphi · brookslogan · Apr 16, 2025 · Apr 2, 2025 · Apr 3, 2025 · Apr 3, 2025
diff --git a/NAMESPACE b/NAMESPACE
@@ -31,6 +31,7 @@ S3method(epix_slide,epi_archive)
 S3method(epix_slide,grouped_epi_archive)
 S3method(epix_truncate_versions_after,epi_archive)
 S3method(epix_truncate_versions_after,grouped_epi_archive)
+S3method(filter,epi_archive)
 S3method(group_by,epi_archive)
 S3method(group_by,epi_df)
 S3method(group_by,grouped_epi_archive)

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,12 @@
 
 Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicate PR's.
 
+# epiprocess 0.12
+
+## New features
+
+- Added `dplyr::filter` implementation for `epi_archive`s.
+
 # epiprocess 0.11
 
 ## Breaking changes

diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
@@ -984,3 +984,136 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
   attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols
   data
 }
+
+
+
+#' [`dplyr::filter`] for `epi_archive`s
+#'
+#' @param .data an `epi_archive`
+#' @param ... as in [`dplyr::filter`]; using the `version` column is not allowed
+#'   unless you use `.format_aware = TRUE`; see details.
+#' @param .by as in [`dplyr::filter`]
+#' @param .format_aware optional, `TRUE` or `FALSE`; default `FALSE`. See
+#'   details.
+#'
+#' @details
+#'
+#' By default, using the `version` column or measurement columns is disabled as
+#' it's easy to get unexpected results. See if either [`epix_as_of`] or
+#' [`epix_slide`] works as an alternative. If they don't cover your use case,
+#' then you can set `.format_aware = TRUE` to enable usage of these columns, but
+#' be careful to:
+#' * Factor in that `.data$DT` may have been converted into a compact format
+#'   based on diffing consecutive versions, and the last version of each
+#'   observation in `.data$DT` will always be carried forward to future
+#'   `version`s`; see details of [`as_epi_archive`].
+#' * Set `clobberable_versions_start` and `versions_end` of the result
+#'   appropriately after the `filter` call. They will be initialized with the
+#'   same values as in `.data`.
+#'
+#' `dplyr::filter` also has an optional argument `.preserve`, which should not
+#' have an impact on (ungrouped) `epi_archive`s, and `grouped_epi_archive`s do
+#' not currently support `dplyr::filter`.
+#'
+#' @examples
+#'
+#' # Filter to one location and a particular time range:
+#' archive_cases_dv_subset %>%
+#'   filter(geo_value == "fl", time_value >= as.Date("2020-10-01"))
+#'
+#' # Convert to weekly by taking the Saturday data for each week, so that
+#' # `case_rate_7d_av` represents a Sun--Sat average:
+#' archive_cases_dv_subset %>%
+#'   filter(as.POSIXlt(time_value)$wday == 6L)
+#'
+#' # Filtering involving the `version` column or measurement columns requires
+#' # extra care. See epix_as_of and epix_slide instead for some common
+#' # operations. One semi-common operation that ends up being fairly simple is
+#' # treating observations as finalized after some amount of time, and ignoring
+#' # any revisions that were made after that point:
+#' archive_cases_dv_subset %>%
+#'   filter(
+#'     version <= time_value + as.difftime(60, units = "days"),
+#'     .format_aware = TRUE
+#'   )
+#'
+#' @export
+filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) {
+  in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal")
+  if (.format_aware) {
+    out_tbl <- in_tbl %>%
+      filter(..., .by = {{ .by }})
+  } else {
+    measurement_colnames <- setdiff(names(.data$DT), key_colnames(.data))
+    forbidden_colnames <- c("version", measurement_colnames)
+    out_tbl <- in_tbl %>%
+      filter(
+        # Add our own fake filter arg to the user's ..., to update the data mask
+        # to prevent `version` column usage.
+        {
+          # We should be evaluating inside the data mask. To disable both
+          # `version` and `.data$version` etc., we need to go to the ancestor
+          # environment containing the data mask's column bindings. This is
+          # likely just the parent env, but search to make sure, in a way akin
+          # to `<<-`:
+          e <- environment()
+          while (!identical(e, globalenv()) && !identical(e, emptyenv())) {
+            if ("version" %in% names(e)) {
+              # This is where the column bindings are. Replace the forbidden ones.
+              # They are expected to be active bindings, so directly
+              # assigning has issues; `rm` first.
+              rm(list = forbidden_colnames, envir = e)
+              delayedAssign("version", cli::cli_abort(c(
+                "Using `version` in `filter.epi_archive` may produce unexpected results.",
+                ">" = "See if `epix_as_of` or `epix_slide` would work instead.",
+                ">" = "If not, see `?filter.epi_archive` details for how to proceed."
+              ), class = "epiprocess__filter_archive__used_version"), assign.env = e)
+              for (measurement_colname in measurement_colnames) {
+                delayedAssign(measurement_colname, cli::cli_abort(c(
+                  "Using `{format_varname(measurement_colname)}`
+                 in `filter.epi_archive` may produce unexpected results.",
+                  ">" = "See `?filter.epi_archive` details for how to proceed."
+                ), class = "epiprocess__filter_archive__used_measurement"), assign.env = e)
+              }
+              break
+            }
+            e <- parent.env(e)
+          }
+          TRUE
+        },
+        ...,
+        .by = {{ .by }}
+      )
+  }
+  # We could try to re-infer the geo_type, e.g., when filtering from
+  # national+state to just state. However, we risk inference failures such as
+  # "hrr" -> "hhs" from filtering to hrr 10, or "custom" -> USA-related when
+  # working with non-USA data:
+  out_geo_type <- .data$geo_type
+  if (.data$time_type == "day") {
+    # We might be going from daily to weekly; re-infer:
+    out_time_type <- guess_time_type(out_tbl$time_value)
+  } else {
+    # We might be filtering weekly to a single time_value; avoid re-inferring to
+    # stay "week". Or in other cases, just skip inferring, as re-inferring is
+    # expected to match the input time_type:
+    out_time_type <- .data$time_type
+  }
+  # Even if they narrow down to just a single value of an other_keys column,
+  # it's probably still better (& simpler) to treat it as an other_keys column
+  # since it still exists in the result:
+  out_other_keys <- .data$other_keys
+  # `filter` makes no guarantees about not aliasing columns in its result when
+  # the filter condition is all TRUE, so don't setDT.
+  out_dtbl <- as.data.table(out_tbl, key = out_other_keys)
+  result <- new_epi_archive(
+    out_dtbl,
+    out_geo_type, out_time_type, out_other_keys,
+    # Assume version-related metadata unchanged; part of why we want to push
+    # back on filter expressions like `.data$version <= .env$as_of`:
+    .data$clobberable_versions_start, .data$versions_end
+  )
+  # Filtering down rows while keeping all (ukey) columns should preserve ukey
+  # uniqueness.
+  result
+}
diff --git a/man/filter.epi_archive.Rd b/man/filter.epi_archive.Rd
diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R
@@ -128,3 +128,108 @@ test_that("group_vars works as expected", {
     "geo_value"
   )
 })
+
+test_that("filter.epi_archive works as expected", {
+  ea2 <- ea2_data %>%
+    as_epi_archive()
+
+  # Some basic output value checks:
+
+  expect_equal(
+    ea2 %>% filter(geo_value == "tn"),
+    new_epi_archive(
+      ea2$DT[FALSE],
+      ea2$geo_type, ea2$time_type, ea2$other_keys,
+      ea2$clobberable_versions_start, ea2$versions_end
+    )
+  )
+
+  expect_equal(
+    ea2 %>% filter(geo_value == "ca", time_value == as.Date("2020-06-02")),
+    new_epi_archive(
+      data.table::data.table(
+        geo_value = "ca", time_value = as.Date("2020-06-02"),
+        version = as.Date("2020-06-02") + 0:2, cases = 0:2
+      ),
+      ea2$geo_type, ea2$time_type, ea2$other_keys,
+      ea2$clobberable_versions_start, ea2$versions_end
+    )
+  )
+
+  # Output geo_type and time_type behavior:
+
+  hrr_day_ea <- tibble(
+    geo_value = c(rep(1, 14), 100),
+    time_value = as.Date("2020-01-01") - 1 + c(1:14, 14),
+    version = time_value + 3,
+    value = 1:15
+  ) %>%
+    as_epi_archive()
+
+  expect_equal(hrr_day_ea$geo_type, "hrr")
+  expect_equal(hrr_day_ea$time_type, "day")
+
+  hrr_week_ea <- hrr_day_ea %>%
+    filter(geo_value == 1, as.POSIXlt(time_value)$wday == 6L)
+
+  expect_equal(hrr_week_ea$geo_type, "hrr")
+  expect_equal(hrr_week_ea$time_type, "week")
+
+  hrr_one_week_ea <- hrr_week_ea %>%
+    filter(time_value == time_value[[1]])
+
+  expect_equal(hrr_one_week_ea$time_type, "week")
+
+  intcustom_day_ea <- hrr_day_ea
+  intcustom_day_ea$geo_type <- "custom"
+
+  intcustom_week_ea <- intcustom_day_ea %>%
+    filter(geo_value == 1, as.POSIXlt(time_value)$wday == 6L)
+
+  expect_equal(intcustom_week_ea$geo_type, "custom")
+  expect_equal(intcustom_week_ea$time_type, "week")
+
+  # Environment variables should be fine:
+  version <- as.Date("2020-06-02") + 1
+  expect_no_error(ea2 %>% filter(geo_value == "ca", .env$version <= time_value))
+
+  # Error-raising:
+  expect_error(
+    ea2 %>% filter(version == as.Date("2020-06-02")),
+    class = "epiprocess__filter_archive__used_version"
+  )
+  expect_error(
+    ea2 %>% filter(version <= as.Date("2020-06-02")),
+    class = "epiprocess__filter_archive__used_version"
+  )
+  expect_snapshot(
+    ea2 %>% filter(version <= as.Date("2020-06-02")),
+    error = TRUE, cnd_class = TRUE
+  )
+  expect_error(
+    ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L),
+    class = "epiprocess__filter_archive__used_measurement"
+  )
+  expect_snapshot(
+    ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L),
+    error = TRUE, cnd_class = TRUE
+  )
+  expect_error(
+    ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L),
+    class = "epiprocess__filter_archive__used_measurement"
+  )
+  expect_error(
+    ea2 %>% filter(cases >= median(cases), .by = geo_value),
+    class = "epiprocess__filter_archive__used_measurement"
+  )
+
+  # Escape hatch:
+  expect_equal(
+    ea2 %>%
+      filter(version <= time_value + as.difftime(1, units = "days"),
+        .format_aware = TRUE
+      ) %>%
+      .$DT,
+    ea2$DT[version <= time_value + as.difftime(1, units = "days"), ]
+  )
+})