cmu-delphi
diff --git a/‎NAMESPACE
+3 b/‎NAMESPACE
+3
diff --git a/‎NEWS.md
+10 b/‎NEWS.md
+10
diff --git a/‎R/archive.R
+26-31 b/‎R/archive.R
+26-31
diff --git a/‎R/grouped_epi_archive.R
+14-7 b/‎R/grouped_epi_archive.R
+14-7
diff --git a/‎R/methods-epi_archive.R
+10-4 b/‎R/methods-epi_archive.R
+10-4
diff --git a/‎R/slide.R
+14-7 b/‎R/slide.R
+14-7
@@ -14,11 +14,13 @@ S3method(group_by,epi_archive)
 S3method(group_by,epi_df)
 S3method(group_by,grouped_epi_archive)
 S3method(group_by_drop_default,grouped_epi_archive)
+S3method(groups,grouped_epi_archive)
 S3method(next_after,Date)
 S3method(next_after,integer)
 S3method(print,epi_df)
 S3method(summary,epi_df)
 S3method(ungroup,epi_df)
+S3method(ungroup,grouped_epi_archive)
 S3method(unnest,epi_df)
 export("%>%")
 export(archive_cases_dv_subset)
@@ -67,6 +69,7 @@ importFrom(dplyr,filter)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_by_drop_default)
 importFrom(dplyr,group_modify)
+importFrom(dplyr,groups)
 importFrom(dplyr,mutate)
 importFrom(dplyr,relocate)
 importFrom(dplyr,rename)
 
@@ -23,6 +23,14 @@ development versions. A ".9999" suffix indicates a development version.
   `epi_slide`.
   * To obtain the old behavior, `dplyr::ungroup` the `epix_slide` result
     immediately.
+* `epix_slide` now guesses `ref_time_values` to be a regularly spaced sequence
+  covering all the `DT$version` values and `version_end`, rather than the
+  distinct `DT$time_value`s. To obtain the old behavior, pass in
+  `ref_time_values = unique(<ungrouped archive>$DT$time_value)`.
+* The default `clobberable_versions_start` for `epi_archive`s is now `NA`, so
+  there will be no warnings by default about potential nonreproducibility. To
+  obtain the old behavior, pass in `clobberable_versions_start =
+  max_version_with_row_in(x)`.
 
 ## Potentially-breaking changes:
 
@@ -34,6 +42,8 @@ development versions. A ".9999" suffix indicates a development version.
 * Changed `bind_rows` on grouped `epi_df`s to not drop the `epi_df` class. Like
   with ungrouped `epi_df`s, the metadata of the result is still simply taken
   from the first result, and may be inappropriate (#242).
+* `epi_slide` and `epix_slide` now raise an error rather than silently filtering
+  out `ref_time_values` that don't meet their expectations.
 
 ## Improvements:
 
 
@@ -61,7 +61,7 @@ validate_version_bound = function(version_bound, x, na_ok,
   }
 }
 
-#' Default arg helper: `max(x$version)`, with error if `x` has 0 rows
+#' `max(x$version)`, with error if `x` has 0 rows
 #'
 #' Exported to make defaults more easily copyable.
 #'
@@ -233,15 +233,17 @@ epi_archive =
 #'   carried forward (LOCF) to interpolate between the version data provided,
 #'   rows that don't change these LOCF results can potentially be omitted to
 #'   save space while maintaining the same behavior (with the help of the
-#'   `clobberable_versions_start` and `versions_end` fields in some
-#'   edge cases). `TRUE` will remove these rows, `FALSE` will not, and missing
-#'   or `NULL` will remove these rows and issue a warning. Generally, this can
-#'   be set to `TRUE`, but if you directly inspect or edit the fields of the
-#'   `epi_archive` such as its `DT`, you will have to determine whether
-#'   `compactify=TRUE` will produce the desired results. If compactification
-#'   here is removing a large proportion of the rows, this may indicate a
-#'   potential for space, time, or bandwidth savings upstream the data pipeline,
-#'   e.g., when fetching, storing, or preparing the input data `x`
+#'   `clobberable_versions_start` and `versions_end` fields in some edge cases).
+#'   `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will
+#'   remove these rows and issue a warning. Generally, this can be set to
+#'   `TRUE`, but if you directly inspect or edit the fields of the `epi_archive`
+#'   such as its `DT`, or rely on redundant updates to achieve a certain
+#'   behavior of the `ref_time_values` default in `epix_slide`, you will have to
+#'   determine whether `compactify=TRUE` will produce the desired results. If
+#'   compactification here is removing a large proportion of the rows, this may
+#'   indicate a potential for space, time, or bandwidth savings upstream in the
+#'   data pipeline, e.g., by avoiding fetching, storing, or processing these
+#'   rows of `x`.
 #' @param clobberable_versions_start Optional; as in [`as_epi_archive`]
 #' @param versions_end Optional; as in [`as_epi_archive`]
 #' @return An `epi_archive` object.
@@ -308,7 +310,7 @@ epi_archive =
             # Apply defaults and conduct checks and apply defaults for
             # `clobberable_versions_start`, `versions_end`:
             if (missing(clobberable_versions_start)) {
-              clobberable_versions_start <- max_version_with_row_in(x)
+              clobberable_versions_start <- NA
             }
             if (missing(versions_end)) {
               versions_end <- max_version_with_row_in(x)
@@ -465,7 +467,7 @@ epi_archive =
               Abort("`max_version` must be at most `self$versions_end`.")
             }
             if (!is.na(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) {
-              Warn('Getting data as of some "clobberable" version that might be hotfixed, synced, or otherwise replaced later with different data using the same version tag.  Thus, the snapshot that we produce here might not be reproducible later. See `?epi_archive` for more info and `?epix_as_of` on how to muffle.',
+              Warn('Getting data as of some recent version which could still be overwritten (under routine circumstances) without assigning a new version number (a.k.a. "clobbered").  Thus, the snapshot that we produce here should not be expected to be reproducible later. See `?epi_archive` for more info and `?epix_as_of` on how to muffle.',
                    class="epiprocess__snapshot_as_of_clobberable_version")
             }
 
@@ -642,25 +644,18 @@ epi_archive =
 #'   same `class` and `typeof` as `x$version`, or an `NA` of any `class` and
 #'   `typeof`: specifically, either (a) the earliest version that could be
 #'   subject to "clobbering" (being overwritten with different update data, but
-#'   using the same version tag as the old update data), or (b) `NA`, to
+#'   using the *same* version tag as the old update data), or (b) `NA`, to
 #'   indicate that no versions are clobberable. There are a variety of reasons
-#'   why versions could be clobberable, such as upstream hotfixes to the latest
-#'   version, or delays in data synchronization that were mistaken for versions
-#'   with no updates; potential causes vary between different data pipelines.
-#'   The default value is `max_version_with_row_in(x)`; this default assumes
-#'   that (i) if a row in `x` (even one that `compactify` would consider
-#'   redundant) is present with version `ver`, then all previous versions must
-#'   be finalized and non-clobberable, although `ver` (and onward) might still
-#'   be modified, (ii) even if we have "observed" empty updates for some
-#'   versions beyond `max(x$version)` (as indicated by `versions_end`;
-#'   see below), we can't assume `max(x$version)` has been finalized, because we
-#'   might see a nonfinalized version + empty subsequent versions due to
-#'   upstream database replication delays in combination with the upstream
-#'   replicas using last-version-carried-forward to extrapolate that there were
-#'   no updates, (iii) "redundant" update rows that would be removed by
-#'   `compactify` are not redundant, and actually come from an explicit version
-#'   release that indicates that preceding versions are finalized. If `nrow(x)
-#'   == 0`, then this argument is mandatory.
+#'   why versions could be clobberable under routine circumstances, such as (i)
+#'   today's version of one/all of the columns being published after initially
+#'   being filled with `NA` or LOCF, (ii) a buggy version of today's data being
+#'   published but then fixed and republished later in the day, or (iii) data
+#'   pipeline delays (e.g., publisher uploading, periodic scraping, database
+#'   syncing, periodic fetching, etc.) that make events (i) or (ii) reflected
+#'   later in the day (or even on a different day) than expected; potential
+#'   causes vary between different data pipelines. The default value is `NA`,
+#'   which doesn't consider any versions to be clobberable. Another setting that
+#'   may be appropriate for some pipelines is `max_version_with_row_in(x)`.
 #' @param versions_end Optional; length-1, same `class` and `typeof` as
 #'   `x$version`: what is the last version we have observed? The default is
 #'   `max_version_with_row_in(x)`, but values greater than this could also be
@@ -717,7 +712,7 @@ epi_archive =
 as_epi_archive = function(x, geo_type, time_type, other_keys,
                           additional_metadata = list(),
                           compactify = NULL,
-                          clobberable_versions_start = max_version_with_row_in(x),
+                          clobberable_versions_start = NA,
                           versions_end = max_version_with_row_in(x)) {
   epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata,
                   compactify, clobberable_versions_start, versions_end)
 
@@ -189,14 +189,21 @@ grouped_epi_archive =
               ")
             }
 
-            # If missing, then set ref time values to be everything; else make
-            # sure we intersect with observed time values 
             if (missing(ref_time_values)) {
-              ref_time_values = unique(private$ungrouped$DT$time_value)
-            }
-            else {
-              ref_time_values = ref_time_values[ref_time_values %in%
-                                                unique(private$ungrouped$DT$time_value)]
+              versions_with_updates = c(private$ungrouped$DT$version, private$ungrouped$versions_end)
+              ref_time_values = tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates))
+            } else if (length(ref_time_values) == 0L) {
+              Abort("`ref_time_values` must have at least one element.")
+            } else if (any(is.na(ref_time_values))) {
+              Abort("`ref_time_values` must not include `NA`.")
+            } else if (anyDuplicated(ref_time_values) != 0L) {
+              Abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.")
+            } else if (any(ref_time_values > private$ungrouped$versions_end)) {
+              Abort("All `ref_time_values` must be `<=` the `versions_end`.")
+            } else {
+              # Sort, for consistency with `epi_slide`, although the current
+              # implementation doesn't take advantage of it.
+              ref_time_values = sort(ref_time_values)
             }
 
             # Validate and pre-process `before`:
 
@@ -573,10 +573,13 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr
 #'   were to hold forecasts, then we would expect data for `time_value`s after
 #'   January 8, and the sliding window would extend as far after each
 #'   `ref_time_value` as needed to include all such `time_value`s.)
-#' @param ref_time_values Time values for sliding computations, meaning, each
-#'   element of this vector serves as the reference time point for one sliding
-#'   window. If missing, then this will be set to all unique time values in the
-#'   underlying data table, by default.
+#' @param ref_time_values Reference time values / versions for sliding
+#'   computations; each element of this vector serves both as the anchor point
+#'   for the `time_value` window for the computation and the `max_version`
+#'   `as_of` which we fetch data in this window. If missing, then this will be
+#'   set to a regularly-spaced sequence of values set to cover the range of
+#'   `version`s in the `DT` plus the `versions_end`; the spacing of values will
+#'   be guessed (using the GCD of the skips between values).
 #' @param time_step Optional function used to define the meaning of one time
 #'   step, which if specified, overrides the default choice based on the
 #'   `time_value` column. This function must take a positive integer and return
@@ -633,6 +636,9 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr
 #'   `time_value`, and all `other_keys` present in the version data with
 #'   `time_value` matching one of the `ref_time_values`, this can have unexpected
 #'   behaviors due to reporting latency or reporting dropping in and out.
+#'   6. The `ref_time_values` default for `epix_slide` is based on making an
+#'   evenly-spaced sequence out of the `version`s in the `DT` plus the
+#'   `versions_end`, rather than the `time_value`s.
 #' Apart from this, the interfaces between `epix_slide()` and `epi_slide()` are
 #' the same.
 #'
 
@@ -124,16 +124,23 @@ epi_slide = function(x, f, ..., n, ref_time_values,
   # Arrange by increasing time_value
   x = arrange(x, time_value)
 
-  # If missing, then set ref time values to be everything; else make sure we
-  # intersect with observed time values
   if (missing(ref_time_values)) {
     ref_time_values = unique(x$time_value)
-  } 
-  else {
-    ref_time_values = ref_time_values[ref_time_values %in%
-                                      unique(x$time_value)] 
   }
-              
+  # Some of the checks below can fail even on the above default; just go
+  # ahead and do the full validation & pre-processing even on the default:
+  if (length(ref_time_values) == 0L) {
+    Abort("`ref_time_values` must have at least one element.")
+  } else if (any(is.na(ref_time_values))) {
+    Abort("`ref_time_values` must not include `NA`.")
+  } else if (anyDuplicated(ref_time_values) != 0L) {
+    Abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.")
+  } else if (!all(ref_time_values %in% unique(x$time_value))) {
+    Abort("All `ref_time_values` must appear in `x$time_value`.")
+  } else {
+    ref_time_values = sort(ref_time_values)
+  }
+
   # If before is missing, then use align to set up alignment
   if (missing(before)) {
     align = match.arg(align)