Skip to content

Commit 2df885a

Browse files
authored
Merge pull request #131 from dajmcdon/km-fix-utils
Updated and fixed utils, included adding tests. Issue #127 will need to be accounted for later.
2 parents e88a26f + 297900c commit 2df885a

File tree

6 files changed

+247
-54
lines changed

6 files changed

+247
-54
lines changed

R/archive.R

+54-49
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#' @details An `epi_archive` is an R6 class which contains a data table `DT`, of
1717
#' class `data.table` from the `data.table` package, with (at least) the
1818
#' following columns:
19-
#'
19+
#'
2020
#' * `geo_value`: the geographic value associated with each row of measurements.
2121
#' * `time_value`: the time value associated with each row of measurements.
2222
#' * `version`: the time value specifying the version for each row of
@@ -31,7 +31,7 @@
3131
#' on `DT` directly). There can only be a single row per unique combination of
3232
#' key variables, and thus the key variables are critical for figuring out how
3333
#' to generate a snapshot of data from the archive, as of a given version.
34-
#'
34+
#'
3535
#' In general, last observation carried forward (LOCF) is used to data in
3636
#' between recorded versions. Currently, deletions must be represented as
3737
#' revising a row to a special state (e.g., making the entries `NA` or
@@ -43,7 +43,7 @@
4343
#' reference semantics. A primary consequence of this is that objects are not
4444
#' copied when modified. You can read more about this in Hadley Wickham's
4545
#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book.
46-
#'
46+
#'
4747
#' @section Metadata:
4848
#' The following pieces of metadata are included as fields in an `epi_archive`
4949
#' object:
@@ -75,7 +75,7 @@
7575
#' sliding computation at any given reference time point t is performed on
7676
#' **data that would have been available as of t**. More details on `slide()`
7777
#' are documented in the wrapper function `epix_slide()`.
78-
#'
78+
#'
7979
#' @importFrom R6 R6Class
8080
#' @export
8181
epi_archive =
@@ -89,7 +89,7 @@ epi_archive =
8989
additional_metadata = NULL,
9090
#' @description Creates a new `epi_archive` object.
9191
#' @param x A data frame, data table, or tibble, with columns `geo_value`,
92-
#' `time_value`, `version`, and then any additional number of columns.
92+
#' `time_value`, `version`, and then any additional number of columns.
9393
#' @param geo_type Type for the geo values. If missing, then the function will
9494
#' attempt to infer it from the geo values present; if this fails, then it
9595
#' will be set to "custom".
@@ -105,12 +105,12 @@ epi_archive =
105105
#' @return An `epi_archive` object.
106106
#' @importFrom data.table as.data.table key setkeyv
107107
initialize = function(x, geo_type, time_type, other_keys,
108-
additional_metadata) {
108+
additional_metadata) {
109109
# Check that we have a data frame
110110
if (!is.data.frame(x)) {
111111
Abort("`x` must be a data frame.")
112112
}
113-
113+
114114
# Check that we have geo_value, time_value, version columns
115115
if (!("geo_value" %in% names(x))) {
116116
Abort("`x` must contain a `geo_value` column.")
@@ -121,7 +121,7 @@ epi_archive =
121121
if (!("version" %in% names(x))) {
122122
Abort("`x` must contain a `version` column.")
123123
}
124-
124+
125125
# If geo type is missing, then try to guess it
126126
if (missing(geo_type)) {
127127
geo_type = guess_geo_type(x$geo_value)
@@ -131,7 +131,7 @@ epi_archive =
131131
if (missing(time_type)) {
132132
time_type = guess_time_type(x$time_value)
133133
}
134-
134+
135135
# Finish off with small checks on keys variables and metadata
136136
if (missing(other_keys)) other_keys = NULL
137137
if (missing(additional_metadata)) additional_metadata = list()
@@ -145,7 +145,7 @@ epi_archive =
145145
c("geo_type", "time_type"))) {
146146
Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".")
147147
}
148-
148+
149149
# Create the data table; if x was an un-keyed data.table itself,
150150
# then the call to as.data.table() will fail to set keys, so we
151151
# need to check this, then do it manually if needed
@@ -163,8 +163,8 @@ epi_archive =
163163
cat("An `epi_archive` object, with metadata:\n")
164164
cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type))
165165
cat(sprintf("* %-9s = %s\n", "time_type", self$time_type))
166-
if (!is.null(self$additional_metadata)) {
167-
sapply(self$additional_metadata, function(m) {
166+
if (!is.null(self$additional_metadata)) {
167+
sapply(self$additional_metadata, function(m) {
168168
cat(sprintf("* %-9s = %s\n", names(m), m))
169169
})
170170
}
@@ -178,10 +178,15 @@ epi_archive =
178178
cat(sprintf("* %-14s = %s\n", "max version",
179179
max(self$DT$version)))
180180
cat("----------\n")
181-
cat(sprintf("Data archive (stored in DT field): %i x %i\n",
181+
cat(sprintf("Data archive (stored in DT field): %i x %i\n",
182182
nrow(self$DT), ncol(self$DT)))
183183
cat("----------\n")
184-
cat(sprintf("Public methods: %s",
184+
cat(sprintf("Columns in DT: %s\n", paste(ifelse(length(
185+
colnames(self$DT)) <= 4, paste(colnames(self$DT), collapse = ", "),
186+
paste(paste(colnames(self$DT)[1:4], collapse = ", "), "and",
187+
length(colnames(self$DT)[5:length(colnames(self$DT))]), "more columns")))))
188+
cat("----------\n")
189+
cat(sprintf("Public methods: %s\n",
185190
paste(names(epi_archive$public_methods),
186191
collapse = ", ")))
187192
},
@@ -195,7 +200,7 @@ epi_archive =
195200
other_keys = setdiff(key(self$DT),
196201
c("geo_value", "time_value", "version"))
197202
if (length(other_keys) == 0) other_keys = NULL
198-
203+
199204
# Check a few things on max_version
200205
if (!identical(class(max_version), class(self$DT$version))) {
201206
Abort("`max_version` and `DT$version` must have same class.")
@@ -209,23 +214,23 @@ epi_archive =
209214
if (max_version == self_max) {
210215
Warn("Getting data as of the latest version possible. For a variety of reasons, it is possible that we only have a preliminary picture of this version (e.g., the upstream source has updated it but we have not seen it due to latency in synchronization). Thus, the snapshot that we produce here might not be reproducible at a later time (e.g., when the archive has caught up in terms of synchronization).")
211216
}
212-
217+
213218
# Filter by version and return
214219
return(
215220
# Make sure to use data.table ways of filtering and selecting
216221
self$DT[time_value >= min_time_value &
217222
version <= max_version, ] %>%
218223
unique(by = c("geo_value", "time_value", other_keys),
219224
fromLast = TRUE) %>%
220-
tibble::as_tibble() %>%
225+
tibble::as_tibble() %>%
221226
dplyr::select(-.data$version) %>%
222227
as_epi_df(geo_type = self$geo_type,
223228
time_type = self$time_type,
224229
as_of = max_version,
225230
additional_metadata = c(self$additional_metadata,
226231
other_keys = other_keys))
227232
)
228-
},
233+
},
229234
#####
230235
#' @description Merges another `data.table` with the current one, and allows for
231236
#' a post-filling of `NA` values by last observation carried forward (LOCF).
@@ -234,7 +239,7 @@ epi_archive =
234239
merge = function(y, ..., locf = TRUE, nan = NA) {
235240
# Check we have a `data.table` object
236241
if (!(inherits(y, "data.table") || inherits(y, "epi_archive"))) {
237-
Abort("`y` must be of class `data.table` or `epi_archive`.")
242+
Abort("`y` must be of class `data.table` or `epi_archive`.")
238243
}
239244

240245
# Use the data.table merge function, carrying through ... args
@@ -249,42 +254,42 @@ epi_archive =
249254

250255
# Important: use nafill and not setnafill because the latter
251256
# returns the entire data frame by reference, and the former can
252-
# be set to act on particular columns by reference using :=
257+
# be set to act on particular columns by reference using :=
253258
self$DT[,
254-
(cols) := nafill(.SD, type = "locf", nan = nan),
255-
.SDcols = cols,
259+
(cols) := nafill(.SD, type = "locf", nan = nan),
260+
.SDcols = cols,
256261
by = by]
257262
}
258-
},
263+
},
259264
#####
260265
#' @description Slides a given function over variables in an `epi_archive`
261266
#' object. See the documentation for the wrapper function `epix_as_of()` for
262-
#' details.
267+
#' details.
263268
#' @importFrom data.table key
264269
#' @importFrom rlang !! !!! enquo enquos is_quosure sym syms
265-
slide = function(f, ..., n = 7, group_by, ref_time_values,
270+
slide = function(f, ..., n = 7, group_by, ref_time_values,
266271
time_step, new_col_name = "slide_value",
267272
as_list_col = FALSE, names_sep = "_",
268-
all_rows = FALSE) {
273+
all_rows = FALSE) {
269274
# If missing, then set ref time values to be everything; else make
270-
# sure we intersect with observed time values
275+
# sure we intersect with observed time values
271276
if (missing(ref_time_values)) {
272277
ref_time_values = unique(self$DT$time_value)
273278
}
274279
else {
275280
ref_time_values = ref_time_values[ref_time_values %in%
276281
unique(self$DT$time_value)]
277282
}
278-
279-
# If a custom time step is specified, then redefine units
283+
284+
# If a custom time step is specified, then redefine units
280285
before_num = n-1
281286
if (!missing(time_step)) before_num = time_step(n-1)
282-
287+
283288
# What to group by? If missing, set according to internal keys
284289
if (missing(group_by)) {
285290
group_by = setdiff(key(self$DT), c("time_value", "version"))
286291
}
287-
292+
288293
# Symbolize column name, defuse grouping variables. We have to do
289294
# the middle step here which is a bit complicated (unfortunately)
290295
# since the function epix_slide() could have called the current one,
@@ -296,20 +301,20 @@ epi_archive =
296301

297302
# Key variable names, apart from time value and version
298303
key_vars = setdiff(key(self$DT), c("time_value", "version"))
299-
304+
300305
# Computation for one group, one time value
301306
comp_one_grp = function(.data_group,
302-
f, ...,
307+
f, ...,
303308
time_value,
304309
key_vars,
305310
new_col) {
306-
# Carry out the specified computation
311+
# Carry out the specified computation
307312
comp_value = f(.data_group, ...)
308313

309314
# Count the number of appearances of the reference time value.
310315
# Note: ideally, we want to directly count occurrences of the ref
311316
# time value but due to latency, this will often not appear in the
312-
# data group. So we count the number of unique key values, outside
317+
# data group. So we count the number of unique key values, outside
313318
# of the time value column
314319
count = sum(!duplicated(.data_group[, key_vars]))
315320

@@ -343,23 +348,23 @@ epi_archive =
343348
else {
344349
Abort("The slide computation must return an atomic vector or a data frame.")
345350
}
346-
351+
347352
# Note that we've already recycled comp value to make size stable,
348353
# so tibble() will just recycle time value appropriately
349-
return(tibble::tibble(time_value = time_value,
354+
return(tibble::tibble(time_value = time_value,
350355
!!new_col := comp_value))
351356
}
352-
357+
353358
# If f is not missing, then just go ahead, slide by group
354359
if (!missing(f)) {
355360
if (rlang::is_formula(f)) f = rlang::as_function(f)
356-
361+
357362
x = purrr::map_dfr(ref_time_values, function(t) {
358363
self$as_of(t, min_time_value = t - before_num) %>%
359-
tibble::as_tibble() %>%
364+
tibble::as_tibble() %>%
360365
dplyr::group_by(!!!group_by) %>%
361366
dplyr::group_modify(comp_one_grp,
362-
f = f, ...,
367+
f = f, ...,
363368
time_value = t,
364369
key_vars = key_vars,
365370
new_col = new_col,
@@ -377,14 +382,14 @@ epi_archive =
377382
if (length(quos) > 1) {
378383
Abort("If `f` is missing then only a single computation can be specified via `...`.")
379384
}
380-
385+
381386
quo = quos[[1]]
382387
f = function(x, quo, ...) rlang::eval_tidy(quo, x)
383388
new_col = sym(names(rlang::quos_auto_name(quos)))
384389

385390
x = purrr::map_dfr(ref_time_values, function(t) {
386391
self$as_of(t, min_time_value = t - before_num) %>%
387-
tibble::as_tibble() %>%
392+
tibble::as_tibble() %>%
388393
dplyr::group_by(!!!group_by) %>%
389394
dplyr::group_modify(comp_one_grp,
390395
f = f, quo = quo,
@@ -395,12 +400,12 @@ epi_archive =
395400
dplyr::ungroup()
396401
})
397402
}
398-
403+
399404
# Unnest if we need to
400405
if (!as_list_col) {
401406
x = tidyr::unnest(x, !!new_col, names_sep = names_sep)
402407
}
403-
408+
404409
# Join to get all rows, if we need to, then return
405410
if (all_rows) {
406411
cols = c(as.character(group_by), "time_value")
@@ -411,7 +416,7 @@ epi_archive =
411416
}
412417
)
413418
)
414-
419+
415420
#' Convert to `epi_archive` format
416421
#'
417422
#' Converts a data frame, data table, or tibble into an `epi_archive`
@@ -464,15 +469,15 @@ epi_archive =
464469
#' time_type = "day",
465470
#' other_keys = "county")
466471
as_epi_archive = function(x, geo_type, time_type, other_keys,
467-
additional_metadata = list()) {
468-
epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata)
472+
additional_metadata = list()) {
473+
epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata)
469474
}
470475

471476
#' Test for `epi_archive` format
472477
#'
473478
#' @param x An object.
474479
#' @return `TRUE` if the object inherits from `epi_archive`.
475-
#'
480+
#'
476481
#' @export
477482
#' @examples
478483
#' is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive)

R/slide.R

+26-2
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,27 @@
8484
#' tidy evaluation (first example, above), then the name for the new column is
8585
#' inferred from the given expression and overrides any name passed explicitly
8686
#' through the `new_col_name` argument.
87-
#'
87+
#'
88+
#' When `f` is a named function with arguments, if a tibble with an unnamed
89+
#' grouping variable is passed in as the method argument to `f`, include a
90+
#' parameter for the grouping-variable in `function()` just prior to
91+
#' specifying the method to prevent that from being overridden. For example:
92+
#' ```
93+
#' # Construct an tibble with an unnamed grouping variable
94+
#' edf = bind_rows(tibble(geo_value = "ak", time_value = as.Date("2020-01-01")
95+
#' + 1:10, x1=1:10, y=1:10 + rnorm(10L))) %>%
96+
#' as_epi_df()
97+
#'
98+
#' # Now, include a row parameter for the grouping variable in the tibble,
99+
#' # which we denote as g, just prior to method = "qr"
100+
#' # Note that if g was not included below, then the method = "qr" would be
101+
#' # overridden, as described above
102+
#' edf %>%
103+
#' group_by(geo_value) %>%
104+
#' epi_slide(function(x, g, method="qr", ...) tibble(model=list(
105+
#' lm(y ~ x1, x, method=method))), n=7L)
106+
#' ```
107+
#'
88108
#' @importFrom lubridate days weeks
89109
#' @importFrom rlang .data .env !! enquo enquos sym
90110
#' @export
@@ -125,7 +145,7 @@ epi_slide = function(x, f, ..., n = 7, ref_time_values,
125145
# intersect with observed time values
126146
if (missing(ref_time_values)) {
127147
ref_time_values = unique(x$time_value)
128-
}
148+
}
129149
else {
130150
ref_time_values = ref_time_values[ref_time_values %in%
131151
unique(x$time_value)]
@@ -168,6 +188,10 @@ epi_slide = function(x, f, ..., n = 7, ref_time_values,
168188
time_range = range(unique(x$time_value))
169189
starts = in_range(ref_time_values - before_num, time_range)
170190
stops = in_range(ref_time_values + after_num, time_range)
191+
192+
if( length(starts) == 0 || length(stops) == 0 ) {
193+
Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values and align (and before, if specified).")
194+
}
171195

172196
# Symbolize new column name
173197
new_col = sym(new_col_name)

0 commit comments

Comments
 (0)