Skip to content

Commit 7fd2119

Browse files
authored
Merge pull request #452 from cmu-delphi/ndefries/opt-slide-proper-tidyselect
Implement proper tidyselect for `epi_slide_opt`
2 parents 0ce39c5 + 64658c1 commit 7fd2119

File tree

9 files changed

+171
-167
lines changed

9 files changed

+171
-167
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: epiprocess
33
Title: Tools for basic signal processing in epidemiology
4-
Version: 0.7.10
4+
Version: 0.7.11
55
Authors@R: c(
66
person("Jacob", "Bien", role = "ctb"),
77
person("Logan", "Brooks", email = "[email protected]", role = c("aut", "cre")),

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
1616
- Add new `epi_slide_opt` function to allow much faster rolling computations
1717
in some cases, using `data.table` and `slider` optimized rolling functions
1818
(#433).
19+
- Add tidyselect interface for `epi_slide_opt` and derivatives (#452).
1920
- regenerated the `jhu_csse_daily_subset` dataset with the latest versions of
2021
the data from the API
2122
- changed approach to versioning, see DEVELOPMENT.md for details

R/slide.R

Lines changed: 68 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@
8686
#' @seealso [`epi_slide_opt`] [`epi_slide_mean`] [`epi_slide_sum`]
8787
#' @examples
8888
#' # slide a 7-day trailing average formula on cases
89-
#' # This and other simple sliding means are much faster to do using
90-
#' # the `epi_slide_mean` function instead.
89+
#' # Simple sliding means and sums are much faster to do using
90+
#' # the `epi_slide_mean` and `epi_slide_sum` functions instead.
9191
#' jhu_csse_daily_subset %>%
9292
#' group_by(geo_value) %>%
9393
#' epi_slide(cases_7dav = mean(cases), before = 6) %>%
@@ -377,7 +377,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
377377
#'
378378
#' @importFrom dplyr bind_rows mutate %>% arrange tibble select
379379
#' @importFrom rlang enquo quo_get_expr as_label expr_label caller_arg
380-
#' @importFrom purrr map map_lgl
380+
#' @importFrom tidyselect eval_select
381+
#' @importFrom purrr map map_lgl
381382
#' @importFrom data.table frollmean frollsum frollapply
382383
#' @importFrom lubridate as.period
383384
#' @importFrom checkmate assert_function
@@ -390,50 +391,50 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
390391
#' group_by(geo_value) %>%
391392
#' epi_slide_opt(
392393
#' cases,
393-
#' f = data.table::frollmean, new_col_name = "cases_7dav", names_sep = NULL, before = 6
394+
#' f = data.table::frollmean, before = 6
394395
#' ) %>%
395-
#' # Remove a nonessential var. to ensure new col is printed
396-
#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>%
396+
#' # Remove a nonessential var. to ensure new col is printed, and rename new col
397+
#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>%
397398
#' ungroup()
398399
#'
399400
#' # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed
400401
#' # and accuracy, and to allow partially-missing windows.
401402
#' jhu_csse_daily_subset %>%
402403
#' group_by(geo_value) %>%
403-
#' epi_slide_opt(cases,
404-
#' f = data.table::frollmean,
405-
#' new_col_name = "cases_7dav", names_sep = NULL, before = 6,
404+
#' epi_slide_opt(
405+
#' cases,
406+
#' f = data.table::frollmean, before = 6,
406407
#' # `frollmean` options
407408
#' na.rm = TRUE, algo = "exact", hasNA = TRUE
408409
#' ) %>%
409-
#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>%
410+
#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>%
410411
#' ungroup()
411412
#'
412413
#' # slide a 7-day leading average
413414
#' jhu_csse_daily_subset %>%
414415
#' group_by(geo_value) %>%
415416
#' epi_slide_opt(
416417
#' cases,
417-
#' f = slider::slide_mean, new_col_name = "cases_7dav", names_sep = NULL, after = 6
418+
#' f = slider::slide_mean, after = 6
418419
#' ) %>%
419420
#' # Remove a nonessential var. to ensure new col is printed
420-
#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>%
421+
#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>%
421422
#' ungroup()
422423
#'
423424
#' # slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum`
424425
#' jhu_csse_daily_subset %>%
425426
#' group_by(geo_value) %>%
426427
#' epi_slide_opt(
427428
#' cases,
428-
#' f = data.table::frollsum, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3
429+
#' f = data.table::frollsum, before = 3, after = 3
429430
#' ) %>%
430431
#' # Remove a nonessential var. to ensure new col is printed
431-
#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>%
432+
#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>%
432433
#' ungroup()
433434
epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
434435
time_step,
435-
new_col_name = "slide_value", as_list_col = NULL,
436-
names_sep = "_", all_rows = FALSE) {
436+
new_col_name = NULL, as_list_col = NULL,
437+
names_sep = NULL, all_rows = FALSE) {
437438
assert_class(x, "epi_df")
438439

439440
if (nrow(x) == 0L) {
@@ -443,15 +444,27 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
443444
"i" = "If this computation is occuring within an `epix_slide` call,
444445
check that `epix_slide` `ref_time_values` argument was set appropriately"
445446
),
446-
class = "epiprocess__epi_slide_mean__0_row_input",
447+
class = "epiprocess__epi_slide_opt__0_row_input",
447448
epiprocess__x = x
448449
)
449450
}
450451

451452
if (!is.null(as_list_col)) {
452453
cli_abort(
453-
"`as_list_col` is not supported for `epi_slide_mean`",
454-
class = "epiproces__epi_slide_mean__list_not_supported"
454+
"`as_list_col` is not supported for `epi_slide_[opt/mean/sum]`",
455+
class = "epiprocess__epi_slide_opt__list_not_supported"
456+
)
457+
}
458+
if (!is.null(new_col_name)) {
459+
cli_abort(
460+
"`new_col_name` is not supported for `epi_slide_[opt/mean/sum]`",
461+
class = "epiprocess__epi_slide_opt__new_name_not_supported"
462+
)
463+
}
464+
if (!is.null(names_sep)) {
465+
cli_abort(
466+
"`names_sep` is not supported for `epi_slide_[opt/mean/sum]`",
467+
class = "epiprocess__epi_slide_opt__name_sep_not_supported"
455468
)
456469
}
457470

@@ -543,48 +556,16 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
543556
# `before` and `after` params.
544557
window_size <- before + after + 1L
545558

546-
col_names_quo <- enquo(col_names)
547-
col_names_chr <- as.character(rlang::quo_get_expr(col_names_quo))
548-
if (startsWith(rlang::as_label(col_names_quo), "c(")) {
549-
# List or vector of col names. We need to drop the first element since it
550-
# will be either "c" (if built as a vector) or "list" (if built as a
551-
# list).
552-
col_names_chr <- col_names_chr[-1]
553-
} else if (startsWith(rlang::as_label(col_names_quo), "list(")) {
554-
cli_abort(
555-
"`col_names` must be a single tidy column name or a vector
556-
(`c()`) of tidy column names",
557-
class = "epiprocess__epi_slide_mean__col_names_in_list",
558-
epiprocess__col_names = col_names_chr
559-
)
560-
}
561-
# If single column name, do nothing.
562-
563-
if (is.null(names_sep)) {
564-
if (length(new_col_name) != length(col_names_chr)) {
565-
cli_abort(
566-
c(
567-
"`new_col_name` must be the same length as `col_names` when
568-
`names_sep` is NULL to avoid duplicate output column names."
569-
),
570-
class = "epiprocess__epi_slide_mean__col_names_length_mismatch",
571-
epiprocess__new_col_name = new_col_name,
572-
epiprocess__col_names = col_names_chr
573-
)
574-
}
575-
result_col_names <- new_col_name
576-
} else {
577-
if (length(new_col_name) != 1L && length(new_col_name) != length(col_names_chr)) {
578-
cli_abort(
579-
"`new_col_name` must be either length 1 or the same length as `col_names`.",
580-
class = "epiprocess__epi_slide_mean__col_names_length_mismatch_and_not_one",
581-
epiprocess__new_col_name = new_col_name,
582-
epiprocess__col_names = col_names_chr
583-
)
584-
}
585-
result_col_names <- paste(new_col_name, col_names_chr, sep = names_sep)
586-
}
587-
559+
# The position of a given column can be differ between input `x` and
560+
# `.data_group` since the grouping step by default drops grouping columns.
561+
# To avoid rerunning `eval_select` for every `.data_group`, convert
562+
# positions of user-provided `col_names` into string column names. We avoid
563+
# using `names(pos)` directly for robustness and in case we later want to
564+
# allow users to rename fields via tidyselection.
565+
pos <- eval_select(rlang::enquo(col_names), data = x, allow_rename = FALSE)
566+
col_names_chr <- names(x)[pos]
567+
# Always rename results to "slide_value_<original column name>".
568+
result_col_names <- paste0("slide_value_", col_names_chr)
588569
slide_one_grp <- function(.data_group, .group_key, ...) {
589570
missing_times <- all_dates[!(all_dates %in% .data_group$time_value)]
590571

@@ -600,19 +581,19 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
600581
# If a group contains duplicate time values, `frollmean` will still only
601582
# use the last `k` obs. It isn't looking at dates, it just goes in row
602583
# order. So if the computation is aggregating across multiple obs for the
603-
# same date, `epi_slide_mean` will produce incorrect results; `epi_slide`
604-
# should be used instead.
584+
# same date, `epi_slide_opt` and derivates will produce incorrect
585+
# results; `epi_slide` should be used instead.
605586
if (anyDuplicated(.data_group$time_value) != 0L) {
606587
cli_abort(
607588
c(
608-
"group contains duplicate time values. Using `epi_slide_mean` on this
589+
"group contains duplicate time values. Using `epi_slide_[opt/mean/sum]` on this
609590
group will result in incorrect results",
610591
"i" = "Please change the grouping structure of the input data so that
611592
each group has non-duplicate time values (e.g. `x %>% group_by(geo_value)
612-
%>% epi_slide_mean`)",
593+
%>% epi_slide_opt(f = frollmean)`)",
613594
"i" = "Use `epi_slide` to aggregate across groups"
614595
),
615-
class = "epiprocess__epi_slide_mean__duplicate_time_values",
596+
class = "epiprocess__epi_slide_opt__duplicate_time_values",
616597
epiprocess__data_group = .data_group,
617598
epiprocess__group_key = .group_key
618599
)
@@ -624,7 +605,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
624605
"i" = c("Input data may contain `time_values` closer together than the
625606
expected `time_step` size")
626607
),
627-
class = "epiprocess__epi_slide_mean__unexpected_row_number",
608+
class = "epiprocess__epi_slide_opt__unexpected_row_number",
628609
epiprocess__data_group = .data_group,
629610
epiprocess__group_key = .group_key
630611
)
@@ -669,7 +650,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
669650
}
670651

671652
if (!is_epi_df(result)) {
672-
# `all_rows`handling strip epi_df format and metadata.
653+
# `all_rows`handling strips epi_df format and metadata.
673654
# Restore them.
674655
result <- reclass(result, attributes(x)$metadata)
675656
}
@@ -700,50 +681,51 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
700681
#' # slide a 7-day trailing average formula on cases
701682
#' jhu_csse_daily_subset %>%
702683
#' group_by(geo_value) %>%
703-
#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 6) %>%
684+
#' epi_slide_mean(cases, before = 6) %>%
704685
#' # Remove a nonessential var. to ensure new col is printed
705-
#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>%
686+
#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>%
706687
#' ungroup()
707688
#'
708689
#' # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed
709690
#' # and accuracy, and to allow partially-missing windows.
710691
#' jhu_csse_daily_subset %>%
711692
#' group_by(geo_value) %>%
712-
#' epi_slide_mean(cases,
713-
#' new_col_name = "cases_7dav", names_sep = NULL, before = 6,
693+
#' epi_slide_mean(
694+
#' cases,
695+
#' before = 6,
714696
#' # `frollmean` options
715697
#' na.rm = TRUE, algo = "exact", hasNA = TRUE
716698
#' ) %>%
717-
#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>%
699+
#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>%
718700
#' ungroup()
719701
#'
720702
#' # slide a 7-day leading average
721703
#' jhu_csse_daily_subset %>%
722704
#' group_by(geo_value) %>%
723-
#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, after = 6) %>%
705+
#' epi_slide_mean(cases, after = 6) %>%
724706
#' # Remove a nonessential var. to ensure new col is printed
725-
#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>%
707+
#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>%
726708
#' ungroup()
727709
#'
728710
#' # slide a 7-day centre-aligned average
729711
#' jhu_csse_daily_subset %>%
730712
#' group_by(geo_value) %>%
731-
#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>%
713+
#' epi_slide_mean(cases, before = 3, after = 3) %>%
732714
#' # Remove a nonessential var. to ensure new col is printed
733-
#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>%
715+
#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>%
734716
#' ungroup()
735717
#'
736718
#' # slide a 14-day centre-aligned average
737719
#' jhu_csse_daily_subset %>%
738720
#' group_by(geo_value) %>%
739-
#' epi_slide_mean(cases, new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>%
721+
#' epi_slide_mean(cases, before = 6, after = 7) %>%
740722
#' # Remove a nonessential var. to ensure new col is printed
741-
#' dplyr::select(geo_value, time_value, cases, cases_14dav) %>%
723+
#' dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) %>%
742724
#' ungroup()
743725
epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values,
744726
time_step,
745-
new_col_name = "slide_value", as_list_col = NULL,
746-
names_sep = "_", all_rows = FALSE) {
727+
new_col_name = NULL, as_list_col = NULL,
728+
names_sep = NULL, all_rows = FALSE) {
747729
epi_slide_opt(
748730
x = x,
749731
col_names = {{ col_names }},
@@ -783,14 +765,14 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values,
783765
#' # slide a 7-day trailing sum formula on cases
784766
#' jhu_csse_daily_subset %>%
785767
#' group_by(geo_value) %>%
786-
#' epi_slide_sum(cases, new_col_name = "cases_7dsum", names_sep = NULL, before = 6) %>%
768+
#' epi_slide_sum(cases, before = 6) %>%
787769
#' # Remove a nonessential var. to ensure new col is printed
788-
#' dplyr::select(geo_value, time_value, cases, cases_7dsum) %>%
770+
#' dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) %>%
789771
#' ungroup()
790772
epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values,
791773
time_step,
792-
new_col_name = "slide_value", as_list_col = NULL,
793-
names_sep = "_", all_rows = FALSE) {
774+
new_col_name = NULL, as_list_col = NULL,
775+
names_sep = NULL, all_rows = FALSE) {
794776
epi_slide_opt(
795777
x = x,
796778
col_names = {{ col_names }},
@@ -859,7 +841,7 @@ full_date_seq <- function(x, before, after, time_step) {
859841
"i" = c("The input data's `time_type` was probably `custom` or `day-time`.
860842
These require also passing a `time_step` function.")
861843
),
862-
class = "epiprocess__epi_slide_mean__unmappable_time_type",
844+
class = "epiprocess__full_date_seq__unmappable_time_type",
863845
epiprocess__time_type = ttype
864846
)
865847
}

man-roxygen/opt-slide-params.R

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
1-
#' @param col_names A single tidyselection or a tidyselection vector of the
2-
#' names of one or more columns for which to calculate the rolling mean.
1+
#' @param col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column
2+
#' name(e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), or
3+
#' [other tidy-select expression][tidyselect::language]. Variable names can
4+
#' be used as if they were positions in the data frame, so expressions like
5+
#' `x:y` can be used to select a range of variables. If you have the desired
6+
#' column names stored in a vector `vars`, use `col_names = all_of(vars)`.
7+
#'
8+
#' The tidy-selection renaming interface is not supported, and cannot be used
9+
#' to provide output column names; if you want to customize the output column
10+
#' names, use [`dplyr::rename`] after the slide.
311
#' @param as_list_col Not supported. Included to match `epi_slide` interface.
412
#' @param new_col_name Character vector indicating the name(s) of the new
513
#' column(s) that will contain the derivative values. Default

man/epi_slide.Rd

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)