 #' @details An `epi_archive` is an R6 class which contains a data table `DT`, of
 #' class `data.table` from the `data.table` package, with (at least) the
 #' following columns:
-#'
+#'
 #' * `geo_value`: the geographic value associated with each row of measurements.
 #' * `time_value`: the time value associated with each row of measurements.
 #' * `version`: the time value specifying the version for each row of
@@ ... @@
 #' on `DT` directly). There can only be a single row per unique combination of
 #' key variables, and thus the key variables are critical for figuring out how
 #' to generate a snapshot of data from the archive, as of a given version.
-#'
+#'
 #' In general, last observation carried forward (LOCF) is used to fill in data
 #' between recorded versions. Currently, deletions must be represented as
 #' revising a row to a special state (e.g., making the entries `NA` or
@@ ... @@
 #' reference semantics. A primary consequence of this is that objects are not
 #' copied when modified. You can read more about this in Hadley Wickham's
 #' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book.
-#'
+#'
 #' @section Metadata:
 #' The following pieces of metadata are included as fields in an `epi_archive`
 #' object:
@@ ... @@
 #' sliding computation at any given reference time point t is performed on
 #' **data that would have been available as of t**. More details on `slide()`
 #' are documented in the wrapper function `epix_slide()`.
-#'
+#'
 #' @importFrom R6 R6Class
 #' @export
 epi_archive =
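
For orientation, a minimal sketch of the versioned data an archive stores (toy values; `as_epi_archive()` is the wrapper defined later in this file). A value first reported on 2020-06-02 and revised on 2020-06-03 yields two rows in `DT` that share `geo_value` and `time_value` but differ in `version`:

library(tibble)

# Toy input (hypothetical): one location, one measurement date, two report dates
x = tibble(
  geo_value  = "ca",
  time_value = as.Date("2020-06-01"),
  version    = as.Date(c("2020-06-02", "2020-06-03")),  # initial report, revision
  cases      = c(100, 120)                              # revised from 100 to 120
)
arch = as_epi_archive(x, geo_type = "state", time_type = "day")
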
@@ -89,7 +89,7 @@ epi_archive =
     additional_metadata = NULL,
     #' @description Creates a new `epi_archive` object.
     #' @param x A data frame, data table, or tibble, with columns `geo_value`,
-    #'   `time_value`, `version`, and then any additional number of columns.
+    #'   `time_value`, `version`, and then any additional number of columns.
     #' @param geo_type Type for the geo values. If missing, then the function will
     #'   attempt to infer it from the geo values present; if this fails, then it
     #'   will be set to "custom".
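
Both type arguments are optional; a hedged sketch of the two call styles, reusing the toy `x` above:

arch1 = as_epi_archive(x)  # types inferred via guess_geo_type()/guess_time_type()
arch2 = as_epi_archive(x, geo_type = "state", time_type = "day")  # explicit
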
@@ -105,12 +105,12 @@ epi_archive =
     #' @return An `epi_archive` object.
     #' @importFrom data.table as.data.table key setkeyv
     initialize = function(x, geo_type, time_type, other_keys,
-                          additional_metadata) {
+                          additional_metadata) {
       # Check that we have a data frame
       if (!is.data.frame(x)) {
         Abort("`x` must be a data frame.")
       }
-
+
       # Check that we have geo_value, time_value, version columns
       if (!("geo_value" %in% names(x))) {
         Abort("`x` must contain a `geo_value` column.")
@@ -121,7 +121,7 @@ epi_archive =
       if (!("version" %in% names(x))) {
         Abort("`x` must contain a `version` column.")
       }
-
+
       # If geo type is missing, then try to guess it
       if (missing(geo_type)) {
         geo_type = guess_geo_type(x$geo_value)
@@ -131,7 +131,7 @@ epi_archive =
       if (missing(time_type)) {
         time_type = guess_time_type(x$time_value)
       }
-
+
       # Finish off with small checks on key variables and metadata
       if (missing(other_keys)) other_keys = NULL
       if (missing(additional_metadata)) additional_metadata = list()
@@ -145,7 +145,7 @@ epi_archive =
                    c("geo_type", "time_type"))) {
         Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".")
       }
-
+
       # Create the data table; if x was an un-keyed data.table itself,
       # then the call to as.data.table() will fail to set keys, so we
       # need to check this, then do it manually if needed
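
The keying code itself is outside this hunk; a hedged sketch of what the comment describes, with no `other_keys` for simplicity:

key_cols = c("geo_value", "time_value", "version")
DT = data.table::as.data.table(x, key = key_cols)   # may silently leave DT un-keyed
if (!identical(data.table::key(DT), key_cols)) {
  data.table::setkeyv(DT, key_cols)                 # so set the key manually
}
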
@@ -163,8 +163,8 @@ epi_archive =
       cat("An `epi_archive` object, with metadata:\n")
       cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type))
       cat(sprintf("* %-9s = %s\n", "time_type", self$time_type))
-      if (!is.null(self$additional_metadata)) {
-        sapply(self$additional_metadata, function(m) {
+      if (!is.null(self$additional_metadata)) {
+        sapply(self$additional_metadata, function(m) {
           cat(sprintf("* %-9s = %s\n", names(m), m))
         })
       }
@@ -178,10 +178,15 @@ epi_archive =
       cat(sprintf("* %-14s = %s\n", "max version",
                   max(self$DT$version)))
       cat("----------\n")
-      cat(sprintf("Data archive (stored in DT field): %i x %i\n",
+      cat(sprintf("Data archive (stored in DT field): %i x %i\n",
                   nrow(self$DT), ncol(self$DT)))
       cat("----------\n")
-      cat(sprintf("Public methods: %s",
+      cat(sprintf("Columns in DT: %s\n", paste(ifelse(length(
+        colnames(self$DT)) <= 4, paste(colnames(self$DT), collapse = ", "),
+        paste(paste(colnames(self$DT)[1:4], collapse = ", "), "and",
+              length(colnames(self$DT)[5:length(colnames(self$DT))]), "more columns")))))
+      cat("----------\n")
+      cat(sprintf("Public methods: %s\n",
                   paste(names(epi_archive$public_methods),
                         collapse = ", ")))
     },
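
The truncation logic added above, restated standalone with if/else for readability (illustrative column names, not from the diff):

cols = c("geo_value", "time_value", "version", "cases", "deaths", "recovered")
if (length(cols) <= 4) {
  cat(paste(cols, collapse = ", "), "\n")
} else {
  cat(paste(cols[1:4], collapse = ", "), "and", length(cols) - 4, "more columns\n")
}
# Prints: geo_value, time_value, version, cases and 2 more columns
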
@@ -195,7 +200,7 @@ epi_archive =
       other_keys = setdiff(key(self$DT),
                            c("geo_value", "time_value", "version"))
       if (length(other_keys) == 0) other_keys = NULL
-
+
       # Check a few things on max_version
       if (!identical(class(max_version), class(self$DT$version))) {
         Abort("`max_version` and `DT$version` must have same class.")
@@ -209,23 +214,23 @@ epi_archive =
       if (max_version == self_max) {
         Warn("Getting data as of the latest version possible. For a variety of reasons, it is possible that we only have a preliminary picture of this version (e.g., the upstream source has updated it but we have not seen it due to latency in synchronization). Thus, the snapshot that we produce here might not be reproducible at a later time (e.g., when the archive has caught up in terms of synchronization).")
       }
-
+
       # Filter by version and return
       return(
         # Make sure to use data.table ways of filtering and selecting
         self$DT[time_value >= min_time_value &
                   version <= max_version, ] %>%
           unique(by = c("geo_value", "time_value", other_keys),
                  fromLast = TRUE) %>%
-          tibble::as_tibble() %>%
+          tibble::as_tibble() %>%
           dplyr::select(-.data$version) %>%
           as_epi_df(geo_type = self$geo_type,
                     time_type = self$time_type,
                     as_of = max_version,
                     additional_metadata = c(self$additional_metadata,
                                             other_keys = other_keys))
       )
-    },
+    },
     #####
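
A hedged sketch of the snapshot semantics implemented above, on the toy archive from earlier (assuming `min_time_value` defaults to the full history):

arch$as_of(as.Date("2020-06-02"))  # epi_df with cases = 100 (only the initial report is visible)
arch$as_of(as.Date("2020-06-03"))  # epi_df with cases = 120 (the revision wins, fromLast = TRUE)
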
     #' @description Merges another `data.table` with the current one, and allows for
     #' a post-filling of `NA` values by last observation carried forward (LOCF).
@@ -234,7 +239,7 @@ epi_archive =
     merge = function(y, ..., locf = TRUE, nan = NA) {
       # Check we have a `data.table` object
       if (!(inherits(y, "data.table") || inherits(y, "epi_archive"))) {
-        Abort("`y` must be of class `data.table` or `epi_archive`.")
+        Abort("`y` must be of class `data.table` or `epi_archive`.")
       }

       # Use the data.table merge function, carrying through ... args
@@ -249,42 +254,42 @@ epi_archive =

         # Important: use nafill and not setnafill because the latter
         # returns the entire data frame by reference, and the former can
-        # be set to act on particular columns by reference using :=
+        # be set to act on particular columns by reference using :=
         self$DT[,
-                (cols) := nafill(.SD, type = "locf", nan = nan),
-                .SDcols = cols,
+                (cols) := nafill(.SD, type = "locf", nan = nan),
+                .SDcols = cols,
                 by = by]
       }
-    },
+    },
     #####
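
A hedged usage sketch for merge(), adding a second hypothetical signal to the toy archive from earlier:

y = data.table::data.table(
  geo_value  = "ca",
  time_value = as.Date("2020-06-01"),
  version    = as.Date("2020-06-02"),
  deaths     = 5
)
arch$merge(y)  # locf = TRUE by default: NA gaps introduced by the merge are filled forward
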
     #' @description Slides a given function over variables in an `epi_archive`
     #' object. See the documentation for the wrapper function `epix_slide()` for
-    #'   details.
+    #'   details.
     #' @importFrom data.table key
     #' @importFrom rlang !! !!! enquo enquos is_quosure sym syms
-    slide = function(f, ..., n = 7, group_by, ref_time_values,
+    slide = function(f, ..., n = 7, group_by, ref_time_values,
                      time_step, new_col_name = "slide_value",
                      as_list_col = FALSE, names_sep = "_",
-                     all_rows = FALSE) {
+                     all_rows = FALSE) {
       # If missing, then set ref time values to be everything; else make
-      # sure we intersect with observed time values
+      # sure we intersect with observed time values
       if (missing(ref_time_values)) {
         ref_time_values = unique(self$DT$time_value)
       }
       else {
         ref_time_values = ref_time_values[ref_time_values %in%
                                             unique(self$DT$time_value)]
       }
-
-      # If a custom time step is specified, then redefine units
+
+      # If a custom time step is specified, then redefine units
       before_num = n - 1
       if (!missing(time_step)) before_num = time_step(n - 1)
-
+
       # What to group by? If missing, set according to internal keys
       if (missing(group_by)) {
         group_by = setdiff(key(self$DT), c("time_value", "version"))
       }
-
+
       # Symbolize column name, defuse grouping variables. We have to do
       # the middle step here which is a bit complicated (unfortunately)
       # since the function epix_slide() could have called the current one,
@@ -296,20 +301,20 @@ epi_archive =

       # Key variable names, apart from time value and version
       key_vars = setdiff(key(self$DT), c("time_value", "version"))
-
+
       # Computation for one group, one time value
       comp_one_grp = function(.data_group,
-                              f, ...,
+                              f, ...,
                               time_value,
                               key_vars,
                               new_col) {
-        # Carry out the specified computation
+        # Carry out the specified computation
         comp_value = f(.data_group, ...)

         # Count the number of appearances of the reference time value.
         # Note: ideally, we want to directly count occurrences of the ref
         # time value but due to latency, this will often not appear in the
-        # data group. So we count the number of unique key values, outside
+        # data group. So we count the number of unique key values, outside
         # of the time value column
         count = sum(!duplicated(.data_group[, key_vars]))
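
The counting idiom above, restated on hypothetical data (geo_value plus an assumed `age` other key):

df = data.frame(geo_value = c("ca", "ca", "ny"), age = c("0-17", "0-17", "0-17"))
sum(!duplicated(df[, c("geo_value", "age")]))  # 2 distinct key combinations
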
@@ -343,23 +348,23 @@ epi_archive =
       else {
         Abort("The slide computation must return an atomic vector or a data frame.")
       }
-
+
       # Note that we've already recycled comp value to make size stable,
       # so tibble() will just recycle time value appropriately
-        return(tibble::tibble(time_value = time_value,
+        return(tibble::tibble(time_value = time_value,
                               !!new_col := comp_value))
       }
-
+
       # If f is not missing, then just go ahead, slide by group
       if (!missing(f)) {
         if (rlang::is_formula(f)) f = rlang::as_function(f)
-
+
         x = purrr::map_dfr(ref_time_values, function(t) {
           self$as_of(t, min_time_value = t - before_num) %>%
-            tibble::as_tibble() %>%
+            tibble::as_tibble() %>%
             dplyr::group_by(!!!group_by) %>%
             dplyr::group_modify(comp_one_grp,
-                                f = f, ...,
+                                f = f, ...,
                                 time_value = t,
                                 key_vars = key_vars,
                                 new_col = new_col,
@@ -377,14 +382,14 @@ epi_archive =
       if (length(quos) > 1) {
         Abort("If `f` is missing then only a single computation can be specified via `...`.")
       }
-
+
       quo = quos[[1]]
       f = function(x, quo, ...) rlang::eval_tidy(quo, x)
       new_col = sym(names(rlang::quos_auto_name(quos)))

       x = purrr::map_dfr(ref_time_values, function(t) {
         self$as_of(t, min_time_value = t - before_num) %>%
-          tibble::as_tibble() %>%
+          tibble::as_tibble() %>%
           dplyr::group_by(!!!group_by) %>%
           dplyr::group_modify(comp_one_grp,
                               f = f, quo = quo,
@@ -395,12 +400,12 @@ epi_archive =
           dplyr::ungroup()
       })
       }
-
+
       # Unnest if we need to
       if (!as_list_col) {
         x = tidyr::unnest(x, !!new_col, names_sep = names_sep)
       }
-
+
       # Join to get all rows, if we need to, then return
       if (all_rows) {
         cols = c(as.character(group_by), "time_value")
@@ -411,7 +416,7 @@ epi_archive =
     }
   )
 )
-
+
 #' Convert to `epi_archive` format
 #'
 #' Converts a data frame, data table, or tibble into an `epi_archive`
@@ -464,15 +469,15 @@ epi_archive =
 #'   time_type = "day",
 #'   other_keys = "county")
 as_epi_archive = function(x, geo_type, time_type, other_keys,
-                          additional_metadata = list()) {
-  epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata)
+                          additional_metadata = list()) {
+  epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata)
 }

 #' Test for `epi_archive` format
 #'
 #' @param x An object.
 #' @return `TRUE` if the object inherits from `epi_archive`.
-#'
+#'
 #' @export
 #' @examples
 #' is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive)
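
Tying the pieces together, a hedged end-to-end sketch on the toy archive from the first example (a formula is accepted for `f`, per the `rlang::is_formula()` branch above):

is_epi_archive(arch)  # TRUE
arch$slide(~ mean(.x$cases), n = 7, new_col_name = "cases_7dav")  # version-aware 7-day average
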