@@ -984,3 +984,118 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
984
984
attr(data , " epiprocess::col_modify_recorder_df::cols" ) <- cols
985
985
data
986
986
}
987
+
988
+
989
+
990
+ # ' [`dplyr::filter`] for `epi_archive`s
991
+ # '
992
+ # ' @param .data an `epi_archive`
993
+ # ' @param ... as in [`dplyr::filter`]; using the `version` column is not allowed
994
+ # ' unless you use `.format_aware = TRUE`; see details.
995
+ # ' @param .by as in [`dplyr::filter`]
996
+ # ' @param .format_aware optional, `TRUE` or `FALSE`; default `FALSE`. See
997
+ # ' details.
998
+ # '
999
+ # ' @details
1000
+ # '
1001
+ # ' By default, using the `version` column is disabled as it's easy to
1002
+ # ' get unexpected results. See if either [`epix_as_of`] or [`epix_slide`]
1003
+ # ' works as an alternative. If they don't cover your use case, then you can
1004
+ # ' set `.format_aware = TRUE` to enable usage of the `version` column, but be
1005
+ # ' careful to:
1006
+ # ' * Factor in that `.data$DT` may be using a "compact" format based on diffing
1007
+ # ' consecutive versions; see details of [`as_epi_archive`]
1008
+ # ' * Set `clobberable_versions_start` and `versions_end` of the result
1009
+ # ' appropriately after the `filter` call. They will be initialized with the
1010
+ # ' same values as in `.data`.
1011
+ # '
1012
+ # ' `dplyr::filter` also has an optional argument `.preserve`, which should not
1013
+ # ' have an impact on (ungrouped) `epi_archive`s, and `grouped_epi_archive`s do
1014
+ # ' not currently support `dplyr::filter`.
1015
+ # '
1016
+ # ' @examples
1017
+ # '
1018
+ # ' # Filter to one location and a particular time range:
1019
+ # ' archive_cases_dv_subset %>%
1020
+ # ' filter(geo_value == "fl", time_value >= as.Date("2020-10-01"))
1021
+ # '
1022
+ # ' # Convert to weekly by taking the Saturday data for each week, so that
1023
+ # ' # `case_rate_7d_av` represents a Sun--Sat average:
1024
+ # ' archive_cases_dv_subset %>%
1025
+ # ' filter(as.POSIXlt(time_value)$wday == 6L)
1026
+ # '
1027
+ # ' # Filtering involving versions requires extra care. See epix_as_of and
1028
+ # ' # epix_slide instead for some common operations. One semi-common operation
1029
+ # ' # that ends up being fairly simple is treating observations as finalized
1030
+ # ' # after some amount of time, and ignoring any revisions that were made after
1031
+ # ' # that point:
1032
+ # ' archive_cases_dv_subset %>%
1033
+ # ' filter(version <= time_value + as.difftime(60, units = "days"),
1034
+ # ' .format_aware = TRUE)
1035
+ # '
1036
+ # ' @export
1037
+ filter.epi_archive <- function (.data , ... , .by = NULL , .format_aware = FALSE ) {
1038
+ in_tbl <- tibble :: as_tibble(as.list(.data $ DT ), .name_repair = " minimal" )
1039
+ if (.format_aware ) {
1040
+ out_tbl <- in_tbl %> %
1041
+ filter(... , .by = .by )
1042
+ } else {
1043
+ out_tbl <- in_tbl %> %
1044
+ filter(
1045
+ # Add our own fake filter arg to the user's ..., to update the data mask
1046
+ # to prevent `version` column usage.
1047
+ {
1048
+ # We should be evaluating inside the data mask. To disable both
1049
+ # `version` and `.data$version`, we need to go to the data mask's
1050
+ # ------
1051
+ e <- environment()
1052
+ while (! identical(e , globalenv()) && ! identical(e , emptyenv())) {
1053
+ if (" version" %in% names(e )) {
1054
+ # "version" is expected to be an active binding, and directly
1055
+ # assigning over it has issues; explicitly `rm` first.
1056
+ rm(list = " version" , envir = e )
1057
+ delayedAssign(" version" , cli :: cli_abort(c(
1058
+ " Using `version` in `filter` may produce unexpected results." ,
1059
+ " >" = " See if `epix_as_of` or `epix_slide` would work instead." ,
1060
+ " >" = " If not, see `?filter.epi_archive` details for how to proceed."
1061
+ )), assign.env = e )
1062
+ break
1063
+ }
1064
+ e <- parent.env(e )
1065
+ }
1066
+ TRUE
1067
+ },
1068
+ ... ,
1069
+ .by = .by
1070
+ )
1071
+ }
1072
+ out_geo_type <-
1073
+ if (.data $ geo_type == " custom" ) {
1074
+ # We might be going from a multi-resolution to single-resolution archive;
1075
+ # e.g. national+state -> state; try to re-infer:
1076
+ guess_geo_type(out_tbl $ geo_value )
1077
+ } else {
1078
+ # We risk less-understandable inference failures such as inferring "hhs"
1079
+ # from selecting hrr 10 data; just use the old geo_type:
1080
+ .data $ geo_type
1081
+ }
1082
+ # We might be going from daily to weekly; re-infer:
1083
+ out_time_type <- guess_time_type(out_tbl $ time_value )
1084
+ # Even if they narrow down to just a single value of an other_keys column,
1085
+ # it's probably still better (& simpler) to treat it as an other_keys column
1086
+ # since it still exists in the result:
1087
+ out_other_keys <- .data $ other_keys
1088
+ # `filter` makes no guarantees about not aliasing columns in its result when
1089
+ # the filter condition is all TRUE, so don't setDT.
1090
+ out_dtbl <- as.data.table(out_tbl , key = out_other_keys )
1091
+ result <- new_epi_archive(
1092
+ out_dtbl ,
1093
+ out_geo_type , out_time_type , out_other_keys ,
1094
+ # Assume version-related metadata unchanged; part of why we want to push
1095
+ # back on filter expressions like `.data$version <= .env$as_of`:
1096
+ .data $ clobberable_versions_start , .data $ versions_end
1097
+ )
1098
+ # Filtering down rows while keeping all (ukey) columns should preserve ukey
1099
+ # uniqueness.
1100
+ result
1101
+ }
0 commit comments