@@ -50,27 +50,18 @@ get_forecast_date <- function(new_data, info, epi_keys_checked, latency, columns
5050 )
5151 }
5252 }
53+ max_time <- get_max_time(new_data , epi_keys_checked , columns )
5354 # the source data determines the actual time_values
54- # these are the non-na time_values;
55- # get the minimum value across the checked epi_keys' maximum time values
56- max_time <- new_data %> %
57- select(all_of(columns )) %> %
58- drop_na()
59- # null and "" don't work in `group_by`
60- if (! is.null(epi_keys_checked ) && (epi_keys_checked != " " )) {
61- max_time <- max_time %> % group_by(get(epi_keys_checked ))
62- }
63- max_time <- max_time %> %
64- summarise(time_value = max(time_value )) %> %
65- pull(time_value ) %> %
66- min()
6755 if (is.null(latency )) {
6856 forecast_date <- attributes(new_data )$ metadata $ as_of
6957 } else {
58+ if (is.null(max_time )) {
59+ cli_abort(" max_time is null. This likely means there is one of {columns} that is all `NA`" )
60+ }
7061 forecast_date <- max_time + latency
7162 }
7263 # make sure the as_of is sane
73- if (! inherits(forecast_date , class(max_time )) & ! inherits(forecast_date , " POSIXt" )) {
64+ if (! inherits(forecast_date , class(new_data $ time_value )) & ! inherits(forecast_date , " POSIXt" )) {
7465 cli_abort(
7566 paste(
7667 " the data matrix `forecast_date` value is {forecast_date}, " ,
@@ -84,13 +75,13 @@ get_forecast_date <- function(new_data, info, epi_keys_checked, latency, columns
8475 if (is.null(forecast_date ) || is.na(forecast_date )) {
8576 cli_warn(
8677 paste(
87- " epi_data's `forecast_date` was {forecast_date} , setting to " ,
88- " the latest time value, {max_time}."
78+ " epi_data's `forecast_date` was `NA` , setting to " ,
79+ " the latest non-`NA` time value for these columns , {max_time}."
8980 ),
9081 class = " epipredict__get_forecast_date__max_time_warning"
9182 )
9283 forecast_date <- max_time
93- } else if (forecast_date < max_time ) {
84+ } else if (! is.null( max_time ) && ( forecast_date < max_time ) ) {
9485 cli_abort(
9586 paste(
9687 " `forecast_date` ({(forecast_date)}) is before the most " ,
@@ -101,22 +92,49 @@ get_forecast_date <- function(new_data, info, epi_keys_checked, latency, columns
10192 )
10293 }
10394 # TODO cover the rest of the possible types for as_of and max_time...
104- if (inherits(max_time , " Date" )) {
95+ if (inherits(new_data $ time_value , " Date" )) {
10596 forecast_date <- as.Date(forecast_date )
10697 }
10798 return (forecast_date )
10899}
109100
101+ get_max_time <- function (new_data , epi_keys_checked , columns ) {
102+ # these are the non-na time_values;
103+ # get the minimum value across the checked epi_keys' maximum time values
104+ max_time <- new_data %> %
105+ select(all_of(columns )) %> %
106+ drop_na()
107+ if (nrow(max_time ) == 0 ) {
108+ return (NULL )
109+ }
110+ # null and "" don't work in `group_by`
111+ if (! is.null(epi_keys_checked ) && all(epi_keys_checked != " " )) {
112+ max_time <- max_time %> % group_by(across(all_of(epi_keys_checked )))
113+ }
114+ max_time <- max_time %> %
115+ summarise(time_value = max(time_value )) %> %
116+ pull(time_value ) %> %
117+ min()
118+ return (max_time )
119+ }
120+
121+
122+
110123# ' the latency is also the amount the shift is off by
111124# ' @param sign_shift integer. 1 if lag and -1 if ahead. These represent how you
112125# ' need to shift the data to bring the 3 day lagged value to today.
113126# ' @keywords internal
114127get_latency <- function (new_data , forecast_date , column , sign_shift , epi_keys_checked ) {
115128 shift_max_date <- new_data %> %
116129 drop_na(all_of(column ))
130+ if (nrow(shift_max_date ) == 0 ) {
131+ # if everything is an NA, there's infinite latency, but shifting by that is
132+ # untenable. May as well not shift at all
133+ return (0 )
134+ }
117135 # null and "" don't work in `group_by`
118- if (! is.null(epi_keys_checked ) && epi_keys_checked != " " ) {
119- shift_max_date <- shift_max_date %> % group_by(get( epi_keys_checked ))
136+ if (! is.null(epi_keys_checked ) && all( epi_keys_checked != " " ) ) {
137+ shift_max_date <- shift_max_date %> % group_by(across(all_of( epi_keys_checked ) ))
120138 }
121139 shift_max_date <- shift_max_date %> %
122140 summarise(time_value = max(time_value )) %> %
@@ -290,7 +308,8 @@ check_interminable_latency <- function(dataset, latency_table, target_columns, f
290308# ' @keywords internal
291309# ' @importFrom dplyr rowwise
292310get_latency_table <- function (training , columns , forecast_date , latency ,
293- sign_shift , epi_keys_checked , info , terms ) {
311+ sign_shift , epi_keys_checked , keys_to_ignore ,
312+ info , terms ) {
294313 if (is.null(columns )) {
295314 columns <- recipes_eval_select(terms , training , info )
296315 }
@@ -300,12 +319,17 @@ get_latency_table <- function(training, columns, forecast_date, latency,
300319 if (length(columns ) > 0 ) {
301320 latency_table <- latency_table %> % filter(col_name %in% columns )
302321 }
303-
322+ training_dropped <- training %> %
323+ drop_ignored_keys(keys_to_ignore )
304324 if (is.null(latency )) {
305325 latency_table <- latency_table %> %
306326 rowwise() %> %
307327 mutate(latency = get_latency(
308- training , forecast_date , col_name , sign_shift , epi_keys_checked
328+ training_dropped ,
329+ forecast_date ,
330+ col_name ,
331+ sign_shift ,
332+ epi_keys_checked
309333 ))
310334 } else if (length(latency ) > 1 ) {
311335 # if latency has a length, it must also have named elements.
@@ -319,7 +343,7 @@ get_latency_table <- function(training, columns, forecast_date, latency,
319343 latency_table <- latency_table %> %
320344 rowwise() %> %
321345 mutate(latency = get_latency(
322- training , forecast_date , col_name , sign_shift , epi_keys_checked
346+ training % > % drop_ignored_keys( keys_to_ignore ) , forecast_date , col_name , sign_shift , epi_keys_checked
323347 ))
324348 if (latency ) {
325349 latency_table <- latency_table %> % mutate(latency = latency )
@@ -328,6 +352,19 @@ get_latency_table <- function(training, columns, forecast_date, latency,
328352 return (latency_table %> % ungroup())
329353}
330354
355+ # ' given a list named by key columns, remove any matching key values
356+ # ' keys_to_ignore should have the form list(col_name = c("value_to_ignore", "other_value_to_ignore"))
357+ # ' @keywords internal
358+ drop_ignored_keys <- function (training , keys_to_ignore ) {
359+ # note that the extra parenthesis black magic is described here: https://github.com/tidyverse/dplyr/issues/6194
360+ # and is needed to bypass an incomplete port of `across` functions to `if_any`
361+ training %> %
362+ filter((dplyr :: if_all(
363+ names(keys_to_ignore ),
364+ ~ . %nin % keys_to_ignore [[cur_column()]]
365+ )))
366+ }
367+
331368
332369# ' checks: the recipe type, whether a previous step is the relevant epi_shift,
333370# ' that either `fixed_latency` or `fixed_forecast_date` is non-null, and that
@@ -394,7 +431,7 @@ compare_bake_prep_latencies <- function(object, new_data, call = caller_env()) {
394431 )
395432 local_latency_table <- get_latency_table(
396433 new_data , object $ columns , current_forecast_date , latency ,
397- get_sign(object ), object $ epi_keys_checked , NULL , NULL
434+ get_sign(object ), object $ epi_keys_checked , object $ keys_to_ignore , NULL , NULL
398435 )
399436 comparison_table <- local_latency_table %> %
400437 ungroup() %> %
0 commit comments