diff --git a/R/archive.R b/R/archive.R
index bf5a38a9..58136ed5 100644
--- a/R/archive.R
+++ b/R/archive.R
@@ -584,14 +584,15 @@ epi_archive =
#' details.
#' @importFrom data.table key
#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms
- slide = function(f, ..., n, group_by, ref_time_values,
+ slide = function(f, ..., before, group_by,
+ ref_time_values,
time_step, new_col_name = "slide_value",
as_list_col = FALSE, names_sep = "_",
all_rows = FALSE) {
# If missing, then set ref time values to be everything; else make
# sure we intersect with observed time values
if (missing(ref_time_values)) {
- ref_time_values = unique(self$DT$time_value)
+ ref_time_values = unique(self$DT$time_values)
}
else {
ref_time_values = ref_time_values[ref_time_values %in%
@@ -599,8 +600,8 @@ epi_archive =
}
# If a custom time step is specified, then redefine units
- before_num = n-1
- if (!missing(time_step)) before_num = time_step(n-1)
+ before_num = before-1
+ if (!missing(time_step)) before_num = time_step(before-1)
# What to group by? If missing, set according to internal keys;
# otherwise, tidyselect.
@@ -619,7 +620,7 @@ epi_archive =
# Computation for one group, one time value
comp_one_grp = function(.data_group,
f, ...,
- time_value,
+ ref_time_value,
key_vars,
new_col) {
# Carry out the specified computation
@@ -665,21 +666,23 @@ epi_archive =
# Note that we've already recycled comp value to make size stable,
# so tibble() will just recycle time value appropriately
- return(tibble::tibble(time_value = time_value,
+ return(tibble::tibble(time_value = ref_time_value,
!!new_col := comp_value))
}
# If f is not missing, then just go ahead, slide by group
if (!missing(f)) {
+
if (rlang::is_formula(f)) f = rlang::as_function(f)
- x = purrr::map_dfr(ref_time_values, function(t) {
- self$as_of(t, min_time_value = t - before_num) %>%
+ x = purrr::map_dfr(ref_time_values, function(ref_time_value) {
+ self$as_of(ref_time_value,
+ min_time_value = ref_time_value - before_num) %>%
tibble::as_tibble() %>%
dplyr::group_by(!!!group_by) %>%
dplyr::group_modify(comp_one_grp,
f = f, ...,
- time_value = t,
+ ref_time_value = ref_time_value,
key_vars = key_vars,
new_col = new_col,
.keep = TRUE) %>%
@@ -701,13 +704,13 @@ epi_archive =
f = function(x, quo, ...) rlang::eval_tidy(quo, x)
new_col = sym(names(rlang::quos_auto_name(quos)))
- x = purrr::map_dfr(ref_time_values, function(t) {
- self$as_of(t, min_time_value = t - before_num) %>%
+ x = purrr::map_dfr(ref_time_values, function(ref_time_value) {
+ self$as_of(t, min_time_value = ref_time_value - before_num) %>%
tibble::as_tibble() %>%
dplyr::group_by(!!!group_by) %>%
dplyr::group_modify(comp_one_grp,
f = f, quo = quo,
- time_value = t,
+ ref_time_value = ref_time_value,
key_vars = key_vars,
new_col = new_col,
.keep = TRUE) %>%
diff --git a/R/growth_rate.R b/R/growth_rate.R
index d3ca9e31..9778a920 100644
--- a/R/growth_rate.R
+++ b/R/growth_rate.R
@@ -73,7 +73,7 @@
#' implicitly defined by the `x` variable; for example, if `x` is a vector of
#' `Date` objects, `h = 7`, and the reference point is January 7, then the
#' sliding window contains all data in between January 1 and 14 (matching the
-#' behavior of `epi_slide()` with `n = 2 * h` and `align = "center"`).
+#' behavior of `epi_slide()` with `before = h-1` and `after = h`).
#'
#' @section Additional Arguments:
#' For the global methods, "smooth_spline" and "trend_filter", additional
@@ -104,12 +104,12 @@
#' # COVID cases growth rate by state using default method relative change
#' jhu_csse_daily_subset %>%
#' group_by(geo_value) %>%
-#' mutate(cases_gr = growth_rate(x = time_value, y = cases))
+#' mutate(cases_gr = growth_rate(x = time_value, y = cases))
#'
#' # Log scale, degree 4 polynomial and 6-fold cross validation
#' jhu_csse_daily_subset %>%
#' group_by(geo_value) %>%
-#' mutate(gr_poly = growth_rate( x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6))
+#' mutate(gr_poly = growth_rate(x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6))
growth_rate = function(x = seq_along(y), y, x0 = x,
method = c("rel_change", "linear_reg",
diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
index 3387c935..a7874e2b 100644
--- a/R/methods-epi_archive.R
+++ b/R/methods-epi_archive.R
@@ -358,8 +358,10 @@ epix_merge = function(x, y,
#' @param ... Additional arguments to pass to the function or formula specified
#' via `f`. Alternatively, if `f` is missing, then the current argument is
#' interpreted as an expression for tidy evaluation.
-#' @param n Number of time steps to use in the running window. For example, if
-#' `n = 7`, and one time step is one day, then to produce a value on January 7
+#' @param before Number of time steps to use in the running window.
+#' For example, if
+#' `before = 7`, and one time step is one day, then to produce a
+#' value on January 7
#' we apply the given function or formula to data in between January 1 and
#' 7.
#' @param group_by The variable(s) to group by before slide computation. If
@@ -422,11 +424,11 @@ epix_merge = function(x, y,
#' Finally, this is simply a wrapper around the `slide()` method of the
#' `epi_archive` class, so if `x` is an `epi_archive` object, then:
#' ```
-#' epix_slide(x, new_var = comp(old_var), n = 120)
+#' epix_slide(x, new_var = comp(old_var), before = 120)
#' ```
#' is equivalent to:
#' ```
-#' x$slide(x, new_var = comp(old_var), n = 120)
+#' x$slide(new_var = comp(old_var), before = 120)
#' ```
#'
#' @importFrom rlang enquo
@@ -437,22 +439,23 @@ epix_merge = function(x, y,
#' # 0 day which has no results, for 2020-06-01
#' # 1 day, for 2020-06-02
#' # 2 days, for the rest of the results
-#' # never 3 days dur to data latency
+#' # never 3 days due to data latency
#'
-#' time_values <- seq(as.Date("2020-06-01"),
+#' versions <- seq(as.Date("2020-06-01"),
#' as.Date("2020-06-15"),
#' by = "1 day")
#' epix_slide(x = archive_cases_dv_subset,
#' f = ~ mean(.x$case_rate_7d_av),
-#' n = 3,
+#' before = 3,
#' group_by = geo_value,
-#' ref_time_values = time_values,
+#' ref_time_values = versions,
#' new_col_name = 'case_rate_3d_av')
-epix_slide = function(x, f, ..., n, group_by, ref_time_values,
- time_step, new_col_name = "slide_value",
+epix_slide = function(x, f, ..., before, group_by, ref_time_values,
+ time_step, new_col_name = "devslide_value",
as_list_col = FALSE, names_sep = "_", all_rows = FALSE) {
if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.")
- return(x$slide(f, ..., n = n,
+ return(x$slide(f, ...,
+ before = before,
group_by = {{group_by}},
ref_time_values = ref_time_values,
time_step = time_step,
diff --git a/R/outliers.R b/R/outliers.R
index 6cc2ffb1..d4eb40ee 100644
--- a/R/outliers.R
+++ b/R/outliers.R
@@ -179,7 +179,7 @@ detect_outlr_rm = function(x = seq_along(y), y, n = 21,
# Calculate lower and upper thresholds and replacement value
z = z %>%
- epi_slide(fitted = median(y), n = n, align = "center") %>%
+ epi_slide(fitted = median(y), before = floor((n-1)/2), after = ceiling((n-1)/2)) %>%
dplyr::mutate(resid = y - fitted) %>%
roll_iqr(n = n,
detection_multiplier = detection_multiplier,
@@ -332,7 +332,7 @@ roll_iqr = function(z, n, detection_multiplier, min_radius,
if (typeof(z$y) == "integer") as_type = as.integer
else as_type = as.numeric
- epi_slide(z, roll_iqr = stats::IQR(resid), n = n, align = "center") %>%
+ epi_slide(z, roll_iqr = stats::IQR(resid), before = floor((n-1)/2), after = ceiling((n-1)/2)) %>%
dplyr::mutate(
lower = pmax(min_lower,
fitted - pmax(min_radius, detection_multiplier * roll_iqr)),
diff --git a/R/slide.R b/R/slide.R
index 283ecd8c..0e3edf78 100644
--- a/R/slide.R
+++ b/R/slide.R
@@ -6,7 +6,8 @@
#'
#' @param x The `epi_df` object under consideration.
#' @param f Function or formula to slide over variables in `x`. To "slide" means
-#' to apply a function or formula over a running window of `n` time steps
+#' to apply a function or formula over a running window of `before`
+#' and `after` time steps
#' (where one time step is typically one day or one week; see details for more
#' explanation). If a function, `f` should take `x`, an `epi_df` with the same
#' names as the non-grouping columns, followed by `g` to refer to the one row
@@ -19,24 +20,28 @@
#' @param ... Additional arguments to pass to the function or formula specified
#' via `f`. Alternatively, if `f` is missing, then the current argument is
#' interpreted as an expression for tidy evaluation. See details.
-#' @param n Number of time steps to use in the running window. For example, if
-#' `n = 7`, one time step is one day, and the alignment is "right", then to
-#' produce a value on January 7 we apply the given function or formula to data
-#' in between January 1 and 7.
+#' @param before A nonnegative integer specifying the number of time steps
+#' before each of the `ref_time_values` to extract data from.
+#' This must be a vector of length 1.
+#' Set to 0 for a right-aligned/trailing sliding window, meaning
+#' that no
+#' `time_value` after the slide will be used for the sliding calculation.
+#' It is mandatory to specify a `before` value, unless `after` is specified
+#' as a non-zero value. In this case, `before` will be assumed to be 0, as it
+#' assumes the user wants to do a left-aligned/leading sliding window.
+#' However, this usage is discouraged and will thus produce a warning.
+#' @param after A nonnegative integer specifying the number of time steps after
+#' each of the `ref_time_values` to extract data from.
+#' This must be a vector of length 1. The default value for
+#' this is 0. Set to 0 for a left-aligned/leading sliding
+#' window, meaning that no
+#' `time_value` before the slide will be used for the sliding calculation.
+#' To specify this to be centrally aligned, set `before` and `after` to be
+#' the same.
#' @param ref_time_values Time values for sliding computations, meaning, each
#' element of this vector serves as the reference time point for one sliding
#' window. If missing, then this will be set to all unique time values in the
#' underlying data table, by default.
-#' @param align One of "right", "center", or "left", indicating the alignment of
-#' the sliding window relative to the reference time point. If the alignment
-#' is "center" and `n` is even, then one more time point will be used after
-#' the reference time point than before. Default is "right".
-#' @param before Positive integer less than `n`, specifying the number of time
-#' points to use in the sliding window strictly before the reference time
-#' point. For example, setting `before = n-1` would be the same as setting
-#' `align = "right"`. The `before` argument allows for more flexible
-#' specification of alignment than the `align` parameter, and if specified,
-#' overrides `align`.
#' @param time_step Optional function used to define the meaning of one time
#' step, which if specified, overrides the default choice based on the
#' `time_value` column. This function must take a positive integer and return
@@ -60,14 +65,15 @@
#' according to the `new_col_name` argument.
#'
#' @details To "slide" means to apply a function or formula over a running
-#' window of `n` time steps, where the unit (the meaning of one time step) is
+#' window of `before` time steps before and `after` time steps after,
+#' where the unit (the meaning of one time step) is
#' implicitly defined by the way the `time_value` column treats addition and
#' subtraction; for example, if the time values are coded as `Date` objects,
#' then one time step is one day, since `as.Date("2022-01-01") + 1` equals
#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly
#' using the `time_step` argument (which if specified would override the
-#' default choice based on `time_value` column). If less than `n` time steps
-#' are available at any given reference time value, then `epi_slide()` still
+#' default choice based on `time_value` column). If certain time steps
+#' are unavailable at any given reference time value, then `epi_slide()` still
#' attempts to perform the computation anyway (it does not require a complete
#' window). The issue of what to do with partial computations (those run on
#' incomplete windows) is therefore left up to the user, either through the
@@ -76,11 +82,11 @@
#' If `f` is missing, then an expression for tidy evaluation can be specified,
#' for example, as in:
#' ```
-#' epi_slide(x, cases_7dav = mean(cases), n = 7)
+#' epi_slide(x, cases_7dav = mean(cases), before = 6)
#' ```
#' which would be equivalent to:
#' ```
-#' epi_slide(x, function(x, ...) mean(x$cases), n = 7,
+#' epi_slide(x, function(x, ...) mean(x$cases), before = 6,
#' new_col_name = "cases_7dav")
#' ```
#' Thus, to be clear, when the computation is specified via an expression for
@@ -95,16 +101,21 @@
#' # slide a 7-day trailing average formula on cases
#' jhu_csse_daily_subset %>%
#' group_by(geo_value) %>%
-#' epi_slide(cases_7dav = mean(cases), n = 7,
-#' align = "right") %>%
+#' epi_slide(cases_7dav = mean(cases), before = 6) %>%
#' # rmv a nonessential var. to ensure new col is printed
#' dplyr::select(-death_rate_7d_av)
#'
-#' # slide a left-aligned 7-day average
+#' # slide a 7-day leading average
#' jhu_csse_daily_subset %>%
#' group_by(geo_value) %>%
-#' epi_slide(cases_7dav = mean(cases), n = 7,
-#' align = "left") %>%
+#' epi_slide(cases_7dav = mean(cases), before = 0, after = 6) %>%
+#' # rmv a nonessential var. to ensure new col is printed
+#' dplyr::select(-death_rate_7d_av)
+#'
+#' # slide a 7-day centre-aligned average
+#' jhu_csse_daily_subset %>%
+#' group_by(geo_value) %>%
+#' epi_slide(cases_7dav = mean(cases), before = 3, after = 3) %>%
#' # rmv a nonessential var. to ensure new col is printed
#' dplyr::select(-death_rate_7d_av)
#'
@@ -113,11 +124,12 @@
#' group_by(geo_value) %>%
#' epi_slide(a = data.frame(cases_2dav = mean(cases),
#' cases_2dma = mad(cases)),
-#' n = 2, as_list_col = TRUE)
-epi_slide = function(x, f, ..., n, ref_time_values,
- align = c("right", "center", "left"), before, time_step,
+#' before = 1, as_list_col = TRUE)
+epi_slide = function(x, f, ..., before, after = 0, ref_time_values,
+ time_step,
new_col_name = "slide_value", as_list_col = FALSE,
names_sep = "_", all_rows = FALSE) {
+
# Check we have an `epi_df` object
if (!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.")
@@ -133,44 +145,45 @@ epi_slide = function(x, f, ..., n, ref_time_values,
ref_time_values = ref_time_values[ref_time_values %in%
unique(x$time_value)]
}
-
- # If before is missing, then use align to set up alignment
+
+ # We must ensure that both before and after are of length 1
+ if (length(after) != 1L || (!missing(before) && length(before) != 1L)) {
+ Abort("`before` and `after` must be vectors of length 1.")
+ }
+
+ # Before cannot be missing if after is set to 0. If after is set to a nonzero
+ # number, then before must be set to 0
if (missing(before)) {
- align = match.arg(align)
- if (align == "right") {
- before_num = n-1
- after_num = 0
- }
- else if (align == "center") {
- before_num = floor((n-1)/2)
- after_num = ceiling((n-1)/2)
- }
- else {
- before_num = 0
- after_num = n-1
+ if (after == 0) {
+ Abort("`before` cannot be missing when `after` is set to 0.")
+ } else {
+ Warn("`before` missing, `after` nonzero; assuming that left-aligned/leading window is desired and setting `before` = 0.")
+ before = 0
}
}
+ if (!(is.numeric(before) && is.numeric(after))||
+ floor(before) < ceiling(before) ||
+ floor(after) < ceiling(after)) {
+ Abort("`before` and `after` must be integers.")
+ }
+
+
# Otherwise set up alignment based on passed before value
- else {
- if (before < 0 || before > n-1) {
- Abort("`before` must be in between 0 and n-1`.")
- }
-
- before_num = before
- after_num = n-1-before
+ if (before < 0 || after < 0) {
+ Abort("`before` and `after` must be at least 0.")
}
# If a custom time step is specified, then redefine units
if (!missing(time_step)) {
- before_num = time_step(before_num)
- after_num = time_step(after_num)
+ before = time_step(before)
+ after = time_step(after)
}
# Now set up starts and stops for sliding/hopping
time_range = range(unique(x$time_value))
- starts = in_range(ref_time_values - before_num, time_range)
- stops = in_range(ref_time_values + after_num, time_range)
+ starts = in_range(ref_time_values - before, time_range)
+ stops = in_range(ref_time_values + after, time_range)
if( length(starts) == 0 || length(stops) == 0 ) {
Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values and align (and before, if specified).")
diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd
index d13ba4d6..a98798cc 100644
--- a/man/as_epi_archive.Rd
+++ b/man/as_epi_archive.Rd
@@ -91,11 +91,15 @@ examples.
}
\details{
This simply a wrapper around the \code{new()} method of the \code{epi_archive}
-class, so for example:\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day")
-}
+class, so for example:
-would be equivalent to:\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day")
-}
+\if{html}{\out{
}}\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day")
+}\if{html}{\out{
}}
+
+would be equivalent to:
+
+\if{html}{\out{}}\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day")
+}\if{html}{\out{
}}
}
\examples{
# Simple ex. with necessary keys
diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd
index 6d7592e4..851aed7e 100644
--- a/man/as_epi_df.Rd
+++ b/man/as_epi_df.Rd
@@ -52,9 +52,9 @@ examples.
}
\section{Methods (by class)}{
\itemize{
-\item \code{epi_df}: Simply returns the \code{epi_df} object unchanged.
+\item \code{as_epi_df(epi_df)}: Simply returns the \code{epi_df} object unchanged.
-\item \code{tbl_df}: The input tibble \code{x} must contain the columns
+\item \code{as_epi_df(tbl_df)}: The input tibble \code{x} must contain the columns
\code{geo_value} and \code{time_value}. All other columns will be preserved as is,
and treated as measured variables. If \code{as_of} is missing, then the function
will try to guess it from an \code{as_of}, \code{issue}, or \code{version} column of \code{x}
@@ -62,14 +62,14 @@ will try to guess it from an \code{as_of}, \code{issue}, or \code{version} colum
(stored in its attributes); if this fails, then the current day-time will
be used.
-\item \code{data.frame}: Works analogously to \code{as_epi_df.tbl_df()}.
+\item \code{as_epi_df(data.frame)}: Works analogously to \code{as_epi_df.tbl_df()}.
-\item \code{tbl_ts}: Works analogously to \code{as_epi_df.tbl_df()}, except that
+\item \code{as_epi_df(tbl_ts)}: Works analogously to \code{as_epi_df.tbl_df()}, except that
the \code{tbl_ts} class is dropped, and any key variables (other than
"geo_value") are added to the metadata of the returned object, under the
\code{other_keys} field.
-}}
+}}
\examples{
# Convert a `tsibble` that has county code as an extra key
# Notice that county code should be a character string to preserve any leading zeroes
diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd
index 0b198eab..998ade9e 100644
--- a/man/epi_archive.Rd
+++ b/man/epi_archive.Rd
@@ -114,18 +114,18 @@ toy_epi_archive
\section{Methods}{
\subsection{Public methods}{
\itemize{
-\item \href{#method-new}{\code{epi_archive$new()}}
-\item \href{#method-print}{\code{epi_archive$print()}}
-\item \href{#method-as_of}{\code{epi_archive$as_of()}}
-\item \href{#method-fill_through_version}{\code{epi_archive$fill_through_version()}}
-\item \href{#method-merge}{\code{epi_archive$merge()}}
-\item \href{#method-slide}{\code{epi_archive$slide()}}
-\item \href{#method-clone}{\code{epi_archive$clone()}}
+\item \href{#method-epi_archive-new}{\code{epi_archive$new()}}
+\item \href{#method-epi_archive-print}{\code{epi_archive$print()}}
+\item \href{#method-epi_archive-as_of}{\code{epi_archive$as_of()}}
+\item \href{#method-epi_archive-fill_through_version}{\code{epi_archive$fill_through_version()}}
+\item \href{#method-epi_archive-merge}{\code{epi_archive$merge()}}
+\item \href{#method-epi_archive-slide}{\code{epi_archive$slide()}}
+\item \href{#method-epi_archive-clone}{\code{epi_archive$clone()}}
}
}
\if{html}{\out{}}\preformatted{epi_archive$slide(
f,
...,
- n,
+ before,
group_by,
ref_time_values,
time_step,
@@ -290,8 +290,8 @@ details.
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-clone}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-epi_archive-clone}{}}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd
index c64bbce1..1e4c77af 100644
--- a/man/epi_slide.Rd
+++ b/man/epi_slide.Rd
@@ -8,10 +8,9 @@ epi_slide(
x,
f,
...,
- n,
- ref_time_values,
- align = c("right", "center", "left"),
before,
+ after = 0,
+ ref_time_values,
time_step,
new_col_name = "slide_value",
as_list_col = FALSE,
@@ -23,7 +22,8 @@ epi_slide(
\item{x}{The \code{epi_df} object under consideration.}
\item{f}{Function or formula to slide over variables in \code{x}. To "slide" means
-to apply a function or formula over a running window of \code{n} time steps
+to apply a function or formula over a running window of \code{before}
+and \code{after} time steps
(where one time step is typically one day or one week; see details for more
explanation). If a function, \code{f} should take \code{x}, an \code{epi_df} with the same
names as the non-grouping columns, followed by \code{g} to refer to the one row
@@ -38,28 +38,31 @@ to the groupings that would be described by \code{g} if \code{f} was a function.
via \code{f}. Alternatively, if \code{f} is missing, then the current argument is
interpreted as an expression for tidy evaluation. See details.}
-\item{n}{Number of time steps to use in the running window. For example, if
-\code{n = 7}, one time step is one day, and the alignment is "right", then to
-produce a value on January 7 we apply the given function or formula to data
-in between January 1 and 7.}
+\item{before}{A nonnegative integer specifying the number of time steps
+before each of the \code{ref_time_values} to extract data from.
+This must be a vector of length 1.
+Set to 0 for a right-aligned/trailing sliding window, meaning
+that no
+\code{time_value} after the slide will be used for the sliding calculation.
+It is mandatory to specify a \code{before} value, unless \code{after} is specified
+as a non-zero value. In this case, \code{before} will be assumed to be 0, as it
+assumes the user wants to do a left-aligned/leading sliding window.
+However, this usage is discouraged and will thus produce a warning.}
+
+\item{after}{A nonnegative integer specifying the number of time steps after
+each of the \code{ref_time_values} to extract data from.
+This must be a vector of length 1. The default value for
+this is 0. Set to 0 for a left-aligned/leading sliding
+window, meaning that no
+\code{time_value} before the slide will be used for the sliding calculation.
+To specify this to be centrally aligned, set \code{before} and \code{after} to be
+the same.}
\item{ref_time_values}{Time values for sliding computations, meaning, each
element of this vector serves as the reference time point for one sliding
window. If missing, then this will be set to all unique time values in the
underlying data table, by default.}
-\item{align}{One of "right", "center", or "left", indicating the alignment of
-the sliding window relative to the reference time point. If the alignment
-is "center" and \code{n} is even, then one more time point will be used after
-the reference time point than before. Default is "right".}
-
-\item{before}{Positive integer less than \code{n}, specifying the number of time
-points to use in the sliding window strictly before the reference time
-point. For example, setting \code{before = n-1} would be the same as setting
-\code{align = "right"}. The \code{before} argument allows for more flexible
-specification of alignment than the \code{align} parameter, and if specified,
-overrides \code{align}.}
-
\item{time_step}{Optional function used to define the meaning of one time
step, which if specified, overrides the default choice based on the
\code{time_value} column. This function must take a positive integer and return
@@ -93,26 +96,31 @@ examples.
}
\details{
To "slide" means to apply a function or formula over a running
-window of \code{n} time steps, where the unit (the meaning of one time step) is
+window of \code{before} time steps before and \code{after} time steps after,
+where the unit (the meaning of one time step) is
implicitly defined by the way the \code{time_value} column treats addition and
subtraction; for example, if the time values are coded as \code{Date} objects,
then one time step is one day, since \code{as.Date("2022-01-01") + 1} equals
\code{as.Date("2022-01-02")}. Alternatively, the time step can be set explicitly
using the \code{time_step} argument (which if specified would override the
-default choice based on \code{time_value} column). If less than \code{n} time steps
-are available at any given reference time value, then \code{epi_slide()} still
+default choice based on \code{time_value} column). If certain time steps
+are unavailable at any given reference time value, then \code{epi_slide()} still
attempts to perform the computation anyway (it does not require a complete
window). The issue of what to do with partial computations (those run on
incomplete windows) is therefore left up to the user, either through the
specified function or formula \code{f}, or through post-processing.
If \code{f} is missing, then an expression for tidy evaluation can be specified,
-for example, as in:\preformatted{epi_slide(x, cases_7dav = mean(cases), n = 7)
-}
+for example, as in:
-which would be equivalent to:\preformatted{epi_slide(x, function(x, ...) mean(x$cases), n = 7,
+\if{html}{\out{
}}\preformatted{epi_slide(x, cases_7dav = mean(cases), before = 6)
+}\if{html}{\out{
}}
+
+which would be equivalent to:
+
+\if{html}{\out{
}}\preformatted{epi_slide(x, function(x, ...) mean(x$cases), before = 6,
new_col_name = "cases_7dav")
-}
+}\if{html}{\out{
}}
Thus, to be clear, when the computation is specified via an expression for
tidy evaluation (first example, above), then the name for the new column is
@@ -123,16 +131,21 @@ through the \code{new_col_name} argument.
# slide a 7-day trailing average formula on cases
jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
- epi_slide(cases_7dav = mean(cases), n = 7,
- align = "right") \%>\%
+ epi_slide(cases_7dav = mean(cases), before = 6) \%>\%
# rmv a nonessential var. to ensure new col is printed
dplyr::select(-death_rate_7d_av)
- # slide a left-aligned 7-day average
+ # slide a 7-day leading average
+ jhu_csse_daily_subset \%>\%
+ group_by(geo_value) \%>\%
+ epi_slide(cases_7dav = mean(cases), before = 0, after = 6) \%>\%
+ # rmv a nonessential var. to ensure new col is printed
+ dplyr::select(-death_rate_7d_av)
+
+ # slide a 7-day centre-aligned average
jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
- epi_slide(cases_7dav = mean(cases), n = 7,
- align = "left") \%>\%
+ epi_slide(cases_7dav = mean(cases), before = 3, after = 3) \%>\%
# rmv a nonessential var. to ensure new col is printed
dplyr::select(-death_rate_7d_av)
@@ -141,5 +154,5 @@ through the \code{new_col_name} argument.
group_by(geo_value) \%>\%
epi_slide(a = data.frame(cases_2dav = mean(cases),
cases_2dma = mad(cases)),
- n = 2, as_list_col = TRUE)
+ before = 1, as_list_col = TRUE)
}
diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd
index 6dc72a44..4053cd28 100644
--- a/man/epix_as_of.Rd
+++ b/man/epix_as_of.Rd
@@ -29,11 +29,15 @@ examples.
}
\details{
This is simply a wrapper around the \code{as_of()} method of the
-\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then:\preformatted{epix_as_of(x, max_version = v)
-}
+\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then:
-is equivalent to:\preformatted{x$as_of(max_version = v)
-}
+\if{html}{\out{
}}\preformatted{epix_as_of(x, max_version = v)
+}\if{html}{\out{
}}
+
+is equivalent to:
+
+\if{html}{\out{
}}\preformatted{x$as_of(max_version = v)
+}\if{html}{\out{
}}
}
\examples{
# warning message of data latency shown
diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd
index 2acae1a1..cd937fcf 100644
--- a/man/epix_slide.Rd
+++ b/man/epix_slide.Rd
@@ -8,11 +8,11 @@ epix_slide(
x,
f,
...,
- n,
+ before,
group_by,
ref_time_values,
time_step,
- new_col_name = "slide_value",
+ new_col_name = "devslide_value",
as_list_col = FALSE,
names_sep = "_",
all_rows = FALSE
@@ -34,8 +34,10 @@ sliding window of \code{n} time steps.}
via \code{f}. Alternatively, if \code{f} is missing, then the current argument is
interpreted as an expression for tidy evaluation.}
-\item{n}{Number of time steps to use in the running window. For example, if
-\code{n = 7}, and one time step is one day, then to produce a value on January 7
+\item{before}{Number of time steps to use in the running window.
+For example, if
+\code{before = 7}, and one time step is one day, then to produce a
+value on January 7
we apply the given function or formula to data in between January 1 and
7.}
@@ -115,11 +117,15 @@ should never be used in place of \code{epi_slide()}, and only used when
version-aware sliding is necessary (as it its purpose).
Finally, this is simply a wrapper around the \code{slide()} method of the
-\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then:\preformatted{epix_slide(x, new_var = comp(old_var), n = 120)
-}
+\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then:
-is equivalent to:\preformatted{x$slide(x, new_var = comp(old_var), n = 120)
-}
+\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), before = 120)
+}\if{html}{\out{
}}
+
+is equivalent to:
+
+\if{html}{\out{
}}\preformatted{x$slide(new_var = comp(old_var), before = 120)
+}\if{html}{\out{
}}
}
\examples{
# these dates are reference time points for the 3 day average sliding window
@@ -127,15 +133,15 @@ is equivalent to:\preformatted{x$slide(x, new_var = comp(old_var), n = 120)
# 0 day which has no results, for 2020-06-01
# 1 day, for 2020-06-02
# 2 days, for the rest of the results
-# never 3 days dur to data latency
+# never 3 days due to data latency
-time_values <- seq(as.Date("2020-06-01"),
+versions <- seq(as.Date("2020-06-01"),
as.Date("2020-06-15"),
by = "1 day")
epix_slide(x = archive_cases_dv_subset,
f = ~ mean(.x$case_rate_7d_av),
- n = 3,
+ before = 3,
group_by = geo_value,
- ref_time_values = time_values,
+ ref_time_values = versions,
new_col_name = 'case_rate_3d_av')
}
diff --git a/man/growth_rate.Rd b/man/growth_rate.Rd
index 173eff43..890d1968 100644
--- a/man/growth_rate.Rd
+++ b/man/growth_rate.Rd
@@ -105,7 +105,7 @@ reference point is at most \code{h}. Note that the unit for this distance is
implicitly defined by the \code{x} variable; for example, if \code{x} is a vector of
\code{Date} objects, \code{h = 7}, and the reference point is January 7, then the
sliding window contains all data in between January 1 and 14 (matching the
-behavior of \code{epi_slide()} with \code{n = 2 * h} and \code{align = "center"}).
+behavior of \code{epi_slide()} with \code{before = h-1} and \code{after = h}).
}
\section{Additional Arguments}{
@@ -138,10 +138,10 @@ user.
# COVID cases growth rate by state using default method relative change
jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
- mutate(cases_gr = growth_rate(x = time_value, y = cases))
+ mutate(cases_gr = growth_rate(x = time_value, y = cases))
# Log scale, degree 4 polynomial and 6-fold cross validation
jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
- mutate(gr_poly = growth_rate( x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6))
+ mutate(gr_poly = growth_rate(x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6))
}
diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R
index f363ae34..c9e04fc3 100644
--- a/tests/testthat/test-epi_slide.R
+++ b/tests/testthat/test-epi_slide.R
@@ -1,34 +1,80 @@
## Create an epi. df and a function to test epi_slide with
-edf = dplyr::bind_rows(
+grouped = dplyr::bind_rows(
dplyr::tibble(geo_value = "ak", time_value = as.Date("2020-01-01") + 1:200, value=1:200),
dplyr::tibble(geo_value = "al", time_value=as.Date("2020-01-01") + 1:5, value=-(1:5))
) %>%
- as_epi_df()
+ as_epi_df() %>%
+ group_by(geo_value)
f = function(x, ...) dplyr::tibble(value=mean(x$value), count=length(x$value))
-## --- These cases generate the error: ---
-test_that("`ref_time_values` + `align` that result in no slide data, generate the error", {
- expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")),
+## --- These cases generate errors (or not): ---
+test_that("`before` and `after` are both vectors of length 1", {
+ expect_error(epi_slide(grouped, f, before = c(0,1), after = 0, ref_time_values=as.Date("2020-01-01") + 3),
+ "`before` and `after` must be vectors of length 1.")
+ expect_error(epi_slide(grouped, f, before = 1, after = c(0,1), ref_time_values=as.Date("2020-01-01") + 3),
+ "`before` and `after` must be vectors of length 1.")
+})
+
+test_that("`after` must be defined as a non-zero integer if `before` is missing", {
+ expect_error(epi_slide(grouped, f, ref_time_values=as.Date("2020-01-01")),
+ "`before` cannot be missing when `after` is set to 0.")
+ expect_error(epi_slide(grouped, f, after = 0L, ref_time_values=as.Date("2020-01-01")),
+ "`before` cannot be missing when `after` is set to 0.")
+ expect_error(epi_slide(grouped, f, before = 0L, ref_time_values=as.Date("2020-01-01")+1L),
+ NA)
+})
+
+test_that("Both `before` and `after` must be nonnegative integers",{
+ expect_error(epi_slide(grouped, f, before = -1L, ref_time_values=as.Date("2020-01-01")+2L),
+ "`before` and `after` must be at least 0.")
+ expect_error(epi_slide(grouped, f, before = 2L, after = -1L, ref_time_values=as.Date("2020-01-01")+2L),
+ "`before` and `after` must be at least 0.")
+ expect_error(epi_slide(grouped, f, before = "a", ref_time_values=as.Date("2020-01-01")+2L),
+ "`before` and `after` must be integers.")
+ expect_error(epi_slide(grouped, f, before = 1L, after = "a", ref_time_values=as.Date("2020-01-01")+2L),
+ "`before` and `after` must be integers.")
+ expect_error(epi_slide(grouped, f, before = 0.5, ref_time_values=as.Date("2020-01-01")+2L),
+ "`before` and `after` must be integers.")
+ expect_error(epi_slide(grouped, f, before = 1L, after = 0.5, ref_time_values=as.Date("2020-01-01")+2L),
+ "`before` and `after` must be integers.")
+ expect_error(epi_slide(grouped, f, before = NA, ref_time_values=as.Date("2020-01-01")+2L),
+ "`before` and `after` must be integers.")
+ expect_error(epi_slide(grouped, f, before = 1L, after = NA, ref_time_values=as.Date("2020-01-01")+2L),
+ "`before` and `after` must be integers.")
+ # The before and after values can be numerics that are integerish
+ expect_error(epi_slide(grouped, f, before = 1, after = 1, ref_time_values=as.Date("2020-01-01")+2L),NA)
+})
+
+test_that("`ref_time_values` + `before` + `after` that result in no slide data, generate the error", {
+ expect_error(epi_slide(grouped, f, before=2L, ref_time_values=as.Date("2020-01-01")),
"starting and/or stopping times for sliding are out of bounds") # before the first, no data in the slide windows
- expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+207L),
+ expect_error(epi_slide(grouped, f, before=2L, ref_time_values=as.Date("2020-01-01")+207L),
"starting and/or stopping times for sliding are out of bounds") # beyond the last, no data in window
})
-test_that("`ref_time_values` + `align` that have some slide data, but generate the error due to ref. time being out of time range", {
- expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01"), align="left"),
+test_that("`ref_time_values` + `before` + `after` that have some slide data, but generate the error due to ref. time being out of time range", {
+ expect_error(epi_slide(grouped, f, before=0L, after=2L, ref_time_values=as.Date("2020-01-01")),
"starting and/or stopping times for sliding are out of bounds") # before the first, but we'd expect there to be data in the window
- expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+201L),
+ expect_error(epi_slide(grouped, f, before=2L, ref_time_values=as.Date("2020-01-01")+201L),
"starting and/or stopping times for sliding are out of bounds") # beyond the last, but still with data in window
})
+## --- These cases generate warnings (or not): ---
+test_that("Warn user against having a blank `before`",{
+ expect_warning(epi_slide(grouped, f, after = 1L, ref_time_values=as.Date("2020-01-01")+1L),
+ "`before` missing, `after` nonzero; assuming that left-aligned/leading\nwindow is desired and setting `before` = 0.")
+ expect_warning(epi_slide(grouped, f, before = 0L, after = 1L,
+ ref_time_values=as.Date("2020-01-01")+1L), NA)
+})
+
## --- These cases doesn't generate the error: ---
test_that("these doesn't produce an error; the error appears only if the ref time values are out of the range for every group", {
- expect_identical(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+200L) %>%
+ expect_identical(epi_slide(grouped, f, before=2L, ref_time_values=as.Date("2020-01-01")+200L) %>%
dplyr::select("geo_value","slide_value_value"),
dplyr::tibble(geo_value = "ak", slide_value_value = 199)) # out of range for one group
- expect_identical(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-04")) %>%
+ expect_identical(epi_slide(grouped, f, before=2L, ref_time_values=as.Date("2020-01-04")) %>%
dplyr::select("geo_value","slide_value_value"),
dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))) # not out of range for either group
})
\ No newline at end of file
diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R
new file mode 100644
index 00000000..c54ad73c
--- /dev/null
+++ b/tests/testthat/test-epix_slide.R
@@ -0,0 +1,42 @@
+library(dplyr)
+library(rlang)
+
+test_that("epix_slide only works on an epi_archive",{
+ expect_error(epix_slide(data.frame(x=1)))
+})
+
+test_that("epix_slide works as intended",{
+ x <- tibble::tribble(~version, ~time_value,
+ 5, c(1:2,4),
+ 6, c(1:2,4:5),
+ 7, 2:6) %>%
+ tidyr::unnest(time_value)
+
+ xx <- bind_cols(geo_value = rep("x",12),
+ arrange(x,time_value,version),
+ binary = 2^(1:12)) %>%
+ as_epi_archive()
+
+ time_values <- 2:5
+
+ xx1 <- epix_slide(x = xx,
+ f = ~ sum(.xx$binary),
+ before = 3,
+ group_by = geo_value,
+ ref_time_values = versions,
+ new_col_name = "sum_binary")
+
+ xx2 <- tibble(geo_value = rep("x",5),
+ time_value = as.Date("2020-06-01") + 1:5,
+ sum_binary = c(3))
+
+ expect_identical(xx1,xx2) # *
+
+ xx3 <- xx$slide(f = ~ sum(.xx$binary),
+ before = 3,
+ group_by = "geo_value",
+ ref_time_values = time_values,
+ new_col_name = 'sum_binary')
+
+ expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical
+})
diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R
index d0434f59..306dd0c3 100644
--- a/tests/testthat/test-methods-epi_archive.R
+++ b/tests/testthat/test-methods-epi_archive.R
@@ -71,13 +71,13 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss
compactify = TRUE)
reference_by_modulus = epix_slide(x = ea,
f = ~ mean(.x$case_rate_7d_av),
- n = 3,
+ before = 3,
group_by = modulus,
ref_time_values = time_values,
new_col_name = 'case_rate_3d_av')
reference_by_both = epix_slide(x = ea,
f = ~ mean(.x$case_rate_7d_av),
- n = 3,
+ before = 3,
group_by = c(geo_value, modulus),
ref_time_values = time_values,
new_col_name = 'case_rate_3d_av')
@@ -85,7 +85,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss
expect_identical(
ea$slide(
f = ~ mean(.x$case_rate_7d_av),
- n = 3,
+ before = 3,
group_by = modulus,
ref_time_values = time_values,
new_col_name = 'case_rate_3d_av'
@@ -96,7 +96,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss
expect_identical(
epix_slide(x = ea,
f = ~ mean(.x$case_rate_7d_av),
- n = 3,
+ before = 3,
group_by = "modulus",
ref_time_values = time_values,
new_col_name = 'case_rate_3d_av'),
@@ -105,7 +105,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss
expect_identical(
ea$slide(
f = ~ mean(.x$case_rate_7d_av),
- n = 3,
+ before = 3,
group_by = "modulus",
ref_time_values = time_values,
new_col_name = 'case_rate_3d_av'
@@ -121,7 +121,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss
expect_identical(
epix_slide(x = ea,
f = ~ mean(.x$case_rate_7d_av),
- n = 3,
+ before = 3,
group_by = tidyselect::all_of(my_group_by),
ref_time_values = time_values,
new_col_name = 'case_rate_3d_av'),
@@ -130,7 +130,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss
expect_identical(
ea$slide(
f = ~ mean(.x$case_rate_7d_av),
- n = 3,
+ before = 3,
group_by = tidyselect::all_of(my_group_by),
ref_time_values = time_values,
new_col_name = 'case_rate_3d_av'
@@ -141,7 +141,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss
expect_identical(
epix_slide(x = ea,
f = ~ mean(.x$case_rate_7d_av),
- n = 3,
+ before = 3,
ref_time_values = time_values,
new_col_name = 'case_rate_3d_av'),
reference_by_both
@@ -149,7 +149,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss
expect_identical(
ea$slide(
f = ~ mean(.x$case_rate_7d_av),
- n = 3,
+ before = 3,
ref_time_values = time_values,
new_col_name = 'case_rate_3d_av'
),
diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd
index 5514eaaa..285a8e07 100644
--- a/vignettes/advanced.Rmd
+++ b/vignettes/advanced.Rmd
@@ -15,7 +15,7 @@ ensure the result of a slide operation is *size stable*, meaning, it will return
something whose length is the same as the number of appearances of reference
time values for the slide computation in the given data frame/table (this
defaults to all time values, but can be some given subset when `ref_time_values`
-is specified).
+or `ref_time_values` is specified, respectively).
The output of a slide computation should either be an atomic value/vector, or a
data frame. This data frame can have multiple columns, multiple rows, or both.
@@ -46,27 +46,27 @@ df <- tibble(
# 2-day trailing average, per geo value
df %>%
group_by(geo_value) %>%
- epi_slide(x_2dav = mean(x), n = 2)
+ epi_slide(x_2dav = mean(x), before = 1)
# 2-day trailing average, marginally
df %>%
- epi_slide(x_2dav = mean(x), n = 2)
+ epi_slide(x_2dav = mean(x), before = 1)
```
```{r, include = FALSE}
# More checks (not included)
df %>%
- epi_slide(x_2dav = mean(x), n = 2, ref_time_values = as.Date("2020-06-02"))
+ epi_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02"))
df %>%
mutate(version = time_value) %>%
as_epi_archive() %>%
- epix_slide(x_2dav = mean(x), n = 2, ref_time_values = as.Date("2020-06-02"))
+ epix_slide(x_2dav = mean(x), before = 2, ref_time_values = as.Date("2020-06-02"))
df %>%
mutate(version = time_value) %>%
as_epi_archive() %>%
- epix_slide(~ mean(.x$x), n = 2, ref_time_values = as.Date("2020-06-02"))
+ epix_slide(~ mean(.x$x), before = 2, ref_time_values = as.Date("2020-06-02"))
```
When the slide computation returns an atomic vector (rather than a single value)
@@ -76,7 +76,7 @@ same result as the last one.
```{r}
df %>%
- epi_slide(y_2dav = rep(mean(x), 3), n = 2)
+ epi_slide(y_2dav = rep(mean(x), 3), before = 1)
```
However, if the output is an atomic vector (rather than a single value) and it
@@ -85,7 +85,7 @@ are trying to return 2 things for 3 states.
```{r, error = TRUE}
df %>%
- epi_slide(x_2dav = rep(mean(x), 2), n = 2)
+ epi_slide(x_2dav = rep(mean(x), 2), before = 1)
```
## Multi-column outputs
@@ -101,7 +101,7 @@ object returned by `epi_slide()` has a list column containing the slide values.
df2 <- df %>%
group_by(geo_value) %>%
epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
- n = 2, as_list_col = TRUE)
+ before = 1, as_list_col = TRUE)
class(df2$a)
length(df2$a)
@@ -119,7 +119,7 @@ slide computation (here `x_2dav` and `x_2dma`) separated by "_".
df %>%
group_by(geo_value) %>%
epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
- n = 2, as_list_col = FALSE)
+ before = 1, as_list_col = FALSE)
```
We can use `names_sep = NULL` (which gets passed to `tidyr::unnest()`) to drop
@@ -129,7 +129,7 @@ the prefix associated with list column name, in naming the unnested columns.
df %>%
group_by(geo_value) %>%
epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
- n = 2, as_list_col = FALSE, names_sep = NULL)
+ before = 1, as_list_col = FALSE, names_sep = NULL)
```
Furthermore, `epi_slide()` will recycle the single row data frame as needed in
@@ -138,7 +138,7 @@ order to make the result size stable, just like the case for atomic values.
```{r}
df %>%
epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
- n = 2, as_list_col = FALSE, names_sep = NULL)
+ before = 1, as_list_col = FALSE, names_sep = NULL)
```
```{r, include = FALSE}
@@ -146,14 +146,14 @@ df %>%
df %>%
epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
ref_time_values = as.Date("2020-06-02"),
- n = 2, as_list_col = FALSE, names_sep = NULL)
+ before = 1, as_list_col = FALSE, names_sep = NULL)
df %>%
mutate(version = time_value) %>%
as_epi_archive() %>%
epix_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
ref_time_values = as.Date("2020-06-02"),
- n = 2, as_list_col = FALSE, names_sep = NULL)
+ before = 2, as_list_col = FALSE, names_sep = NULL)
```
## Multi-row outputs
@@ -181,7 +181,7 @@ df %>%
filter(time_value == max(time_value)),
interval = "prediction", level = 0.9)
))
- }, n = 2, new_col_name = "fc", names_sep = NULL)
+ }, before = 1, new_col_name = "fc", names_sep = NULL)
```
## Version-aware forecasting, revisited
@@ -222,7 +222,7 @@ x <- y1 %>%
version = issue,
percent_cli = value
) %>%
- as_epi_archive()
+ as_epi_archive(compactify=TRUE)
# mutating merge operation:
x$merge(y2 %>%
@@ -230,7 +230,7 @@ x$merge(y2 %>%
version = issue,
case_rate_7d_av = value
) %>%
- as_epi_archive()
+ as_epi_archive(compactify=TRUE)
)
```
@@ -344,7 +344,7 @@ data.
```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6}
# Latest snapshot of data, and forecast dates
x_latest <- epix_as_of(x, max_version = max(x$DT$version))
-fc_time_values <- seq(as.Date("2020-08-01"),
+fc_versions <- seq(as.Date("2020-08-01"),
as.Date("2021-12-01"),
by = "1 month")
@@ -354,15 +354,15 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) {
x %>%
epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value,
args = prob_arx_args(ahead = ahead)),
- n = 120, ref_time_values = fc_time_values) %>%
- mutate(target_date = time_value + ahead, as_of = TRUE,
+ before = 120, ref_time_values = fc_versions) %>%
+ mutate(target_date = version + ahead, as_of = TRUE,
geo_value = fc_geo_value)
}
else {
x_latest %>%
epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value,
args = prob_arx_args(ahead = ahead)),
- n = 120, ref_time_values = fc_time_values) %>%
+ before = 119, ref_time_values = fc_versions) %>%
mutate(target_date = time_value + ahead, as_of = FALSE)
}
}
diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd
index c3c71d3c..47097c9c 100644
--- a/vignettes/aggregation.Rmd
+++ b/vignettes/aggregation.Rmd
@@ -177,7 +177,7 @@ Explicit imputation for missingness (zero-filling in our case) can be important
for protecting against bugs in all sorts of downstream tasks. For example, even
something as simple as a 7-day trailing average is complicated by missingness.
The function `epi_slide()` looks for all rows within a window of 7 days anchored
-on the right at the reference time point (when `n = 7` and `align = "right"`).
+on the right at the reference time point (when `before = 6`).
But when some days in a given week are missing because they were censored
because they had small case counts, taking an average of the observed case
counts can be misleading and is unintentionally biased upwards. Meanwhile,
@@ -189,7 +189,7 @@ running `epi_slide()` on the zero-filled data brings these trailing averages
xt %>%
as_epi_df() %>%
group_by(geo_value) %>%
- epi_slide(cases_7dav = mean(cases), n = 7) %>%
+ epi_slide(cases_7dav = mean(cases), before = 6) %>%
filter(geo_value == "Plymouth, MA",
abs(time_value - as.Date("2021-07-01")) <= 3) %>%
print(n = 7)
@@ -197,7 +197,7 @@ xt %>%
xt_filled %>%
as_epi_df() %>%
group_by(geo_value) %>%
- epi_slide(cases_7dav = mean(cases), n = 7) %>%
+ epi_slide(cases_7dav = mean(cases), before = 6) %>%
filter(geo_value == "Plymouth, MA",
abs(time_value - as.Date("2021-07-01")) <= 3) %>%
print(n = 7)
diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd
index b78644ad..85151788 100644
--- a/vignettes/archive.Rmd
+++ b/vignettes/archive.Rmd
@@ -269,7 +269,7 @@ head(x$DT)
```
```{r, echo=FALSE, message=FALSE, warning=FALSE}
-x <- archive_cases_dv_subset
+x <- as_epi_archive(archive_cases_dv_subset$DT)
print(x)
head(x$DT)
```
@@ -300,52 +300,88 @@ slide vignette to accomodate exogenous variables in the autoregressive model,
which is often referred to as an ARX model.
```{r}
-prob_arx <- function(x, y, lags = c(0, 7, 14), ahead = 7, min_train_window = 20,
- lower_level = 0.05, upper_level = 0.95, symmetrize = TRUE,
- intercept = FALSE, nonneg = TRUE) {
+library(tidyr)
+library(purrr)
+
+prob_arx_args <- function(lags = c(0, 7, 14),
+ ahead = 7,
+ min_train_window = 20,
+ lower_level = 0.05,
+ upper_level = 0.95,
+ symmetrize = TRUE,
+ intercept = FALSE,
+ nonneg = TRUE) {
+ return(list(lags = lags,
+ ahead = ahead,
+ min_train_window = min_train_window,
+ lower_level = lower_level,
+ upper_level = upper_level,
+ symmetrize = symmetrize,
+ intercept = intercept,
+ nonneg = nonneg))
+}
+
+prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) {
# Return NA if insufficient training data
- if (length(y) < min_train_window + max(lags) + ahead) {
- return(data.frame(point = NA, lower = NA, upper = NA))
+ if (length(y) < args$min_train_window + max(args$lags) + args$ahead) {
+ return(data.frame(geo_value = unique(geo_value), # Return geo value!
+ point = NA, lower = NA, upper = NA))
}
- # Useful transformations
+ # Set up x, y, lags list
if (!missing(x)) x <- data.frame(x, y)
else x <- data.frame(y)
- if (!is.list(lags)) lags <- list(lags)
- lags = rep(lags, length.out = ncol(x))
+ if (!is.list(args$lags)) args$lags <- list(args$lags)
+ args$lags = rep(args$lags, length.out = ncol(x))
# Build features and response for the AR model, and then fit it
- dat <- do.call(
- data.frame,
- unlist( # Below we loop through and build the lagged features
- purrr::map(1:ncol(x), function(i) {
- purrr::map(lags[[i]], function(j) lag(x[,i], n = j))
- }),
- recursive = FALSE))
- names(dat) = paste0("x", 1:ncol(dat))
- if (intercept) dat$x0 = rep(1, nrow(dat))
- dat$y <- lead(y, n = ahead)
- obj <- lm(y ~ . + 0, data = dat)
+ dat <-
+ tibble(i = 1:ncol(x), lag = args$lags) %>%
+ unnest(lag) %>%
+ mutate(name = paste0("x", 1:nrow(.))) %>%
+ # One list element for each lagged feature
+ pmap(function(i, lag, name) {
+ tibble(geo_value = geo_value,
+ time_value = time_value + lag, # Shift back
+ !!name := x[,i])
+ }) %>%
+ # One list element for the response vector
+ c(list(
+ tibble(geo_value = geo_value,
+ time_value = time_value - args$ahead, # Shift forward
+ y = y))) %>%
+ # Combine them together into one data frame
+ reduce(full_join, by = c("geo_value", "time_value")) %>%
+ arrange(time_value)
+ if (args$intercept) dat$x0 = rep(1, nrow(dat))
+ obj <- lm(y ~ . + 0, data = select(dat, -geo_value, -time_value))
+
+ # Use LOCF to fill NAs in the latest feature values (do this by geo value)
+ setDT(dat) # Convert to a data.table object by reference
+ cols <- setdiff(names(dat), c("geo_value", "time_value"))
+ dat[, (cols) := nafill(.SD, type = "locf"), .SDcols = cols, by = "geo_value"]
- # Use LOCF to fill NAs in the latest feature values, make a prediction
- setDT(dat)
- setnafill(dat, type = "locf")
- point <- predict(obj, newdata = tail(dat, 1))
+ # Make predictions
+ test_time_value = max(time_value)
+ point <- predict(obj, newdata = dat %>%
+ dplyr::group_by(geo_value) %>%
+ dplyr::filter(time_value == test_time_value))
- # Compute a band
+ # Compute bands
r <- residuals(obj)
- s <- ifelse(symmetrize, -1, NA) # Should the residuals be symmetrized?
- q <- quantile(c(r, s * r), probs = c(lower_level, upper_level), na.rm = TRUE)
+ s <- ifelse(args$symmetrize, -1, NA) # Should the residuals be symmetrized?
+ q <- quantile(c(r, s * r), probs = c(args$lower, args$upper), na.rm = TRUE)
lower <- point + q[1]
upper <- point + q[2]
# Clip at zero if we need to, then return
- if (nonneg) {
- point = max(point, 0)
- lower = max(lower, 0)
- upper = max(upper, 0)
+ if (args$nonneg) {
+ point = pmax(point, 0)
+ lower = pmax(lower, 0)
+ upper = pmax(upper, 0)
}
- return(data.frame(point = point, lower = lower, upper = upper))
+ return(data.frame(geo_value = unique(geo_value), # Return geo value!
+ point = point, lower = lower, upper = upper))
}
```
@@ -353,12 +389,14 @@ Next we slide this forecaster over the working `epi_archive` object, in order to
forecast COVID-19 case rates 7 days into the future.
```{r}
-fc_time_values <- seq(as.Date("2020-08-01"),
+fc_versions <- seq(as.Date("2020-08-01"),
as.Date("2021-12-01"),
by = "1 month")
-z <- epix_slide(x, fc = prob_arx(x = percent_cli, y = case_rate_7d_av), n = 120,
- ref_time_values = fc_time_values, group_by = geo_value)
+z <- epix_slide(x, fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value,
+ args = prob_arx_args(ahead = 7)),
+ before = 120,
+ ref_time_values = fc_versions, group_by = geo_value)
head(z, 10)
```
@@ -387,19 +425,22 @@ x_latest <- epix_as_of(x, max_version = max(x$DT$version))
k_week_ahead <- function(x, ahead = 7, as_of = TRUE) {
if (as_of) {
x %>%
- epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), n = 120,
- ref_time_values = fc_time_values, group_by = geo_value) %>%
- mutate(target_date = time_value + ahead, as_of = TRUE)
+ epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value,
+ args = prob_arx_args(ahead = ahead)),
+ before = 120, ref_time_values = fc_versions) %>%
+ mutate(target_date = version + ahead, as_of = TRUE,
+ geo_value = fc_geo_value)
}
else {
x_latest %>%
- group_by(geo_value) %>%
- epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), n = 120,
- ref_time_values = fc_time_values) %>%
+ epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value,
+ args = prob_arx_args(ahead = ahead)), before = 119,
+ ref_time_values = fc_versions) %>%
mutate(target_date = time_value + ahead, as_of = FALSE)
}
}
+
# Generate the forecasts, and bind them together
fc <- bind_rows(k_week_ahead(x, ahead = 7, as_of = TRUE),
k_week_ahead(x, ahead = 14, as_of = TRUE),
diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd
index 034235b3..a011ed43 100644
--- a/vignettes/compactify.Rmd
+++ b/vignettes/compactify.Rmd
@@ -102,7 +102,7 @@ speeds <- rbind(speeds, speed_test(iterate_as_of,"as_of_1000x"))
# Performance of slide
slide_median <- function(my_ea) {
- my_ea$slide(median = median(case_rate_7d_av), n = 7)
+ my_ea$slide(median = median(case_rate_7d_av), before = 6)
}
speeds <- rbind(speeds, speed_test(slide_median,"slide_median"))
diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd
index 51170427..2620f8ce 100644
--- a/vignettes/slide.Rmd
+++ b/vignettes/slide.Rmd
@@ -71,7 +71,7 @@ order to smooth the signal, by passing in a formula for the first argument of
```{r}
x %>%
group_by(geo_value) %>%
- epi_slide(~ mean(.x$cases), n = 7) %>%
+ epi_slide(~ mean(.x$cases), before = 6) %>%
head(10)
```
@@ -85,7 +85,7 @@ front using the `new_col_name` argument:
```{r}
x <- x %>%
group_by(geo_value) %>%
- epi_slide(~ mean(.x$cases), n = 7, new_col_name = "cases_7dav")
+ epi_slide(~ mean(.x$cases), before = 6, new_col_name = "cases_7dav")
head(x, 10)
```
@@ -101,7 +101,7 @@ arguments. Recreating the last example of a 7-day trailing average:
```{r}
x <- x %>%
group_by(geo_value) %>%
- epi_slide(function(x, ...) mean(x$cases), n = 7, new_col_name = "cases_7dav")
+ epi_slide(function(x, ...) mean(x$cases), before = 6, new_col_name = "cases_7dav")
head(x, 10)
```
@@ -117,7 +117,7 @@ would in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. For example:
```{r}
x <- x %>%
group_by(geo_value) %>%
- epi_slide(cases_7dav = mean(cases), n = 7)
+ epi_slide(cases_7dav = mean(cases), before = 6)
head(x, 10)
```
@@ -160,7 +160,7 @@ units of the `time_value` column; so, days, in the working `epi_df` being
considered in this vignette).
```{r}
-prob_ar <- function(y, lags = c(0, 7, 14), ahead = 7, min_train_window = 20,
+prob_ar <- function(y, lags = c(0, 7, 14), ahead = 6, min_train_window = 20,
lower_level = 0.05, upper_level = 0.95, symmetrize = TRUE,
intercept = FALSE, nonneg = TRUE) {
# Return NA if insufficient training data
@@ -209,7 +209,7 @@ fc_time_values <- seq(as.Date("2020-06-01"),
by = "1 months")
x %>%
group_by(geo_value) %>%
- epi_slide(fc = prob_ar(cases_7dav), n = 120,
+ epi_slide(fc = prob_ar(cases_7dav), before = 119,
ref_time_values = fc_time_values) %>%
head(10)
```
@@ -233,7 +233,7 @@ so that we can call it a few times.
k_week_ahead <- function(x, ahead = 7) {
x %>%
group_by(geo_value) %>%
- epi_slide(fc = prob_ar(cases_7dav, ahead = ahead), n = 120,
+ epi_slide(fc = prob_ar(cases_7dav, ahead = ahead), before = 119,
ref_time_values = fc_time_values, all_rows = TRUE) %>%
mutate(target_date = time_value + ahead)
}