Skip to content

Commit bf55134

Browse files
committed
feat: simplify daily_to_weekly_archive
* dont use epi_slide, group instead * it's faster and simpler
1 parent 2c0768b commit bf55134

File tree

3 files changed

+21
-65
lines changed

3 files changed

+21
-65
lines changed

Makefile

+6-6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@ test:
1111
run:
1212
Rscript scripts/run.R
1313

14+
run-nohup:
15+
nohup Rscript scripts/run.R &
16+
17+
run-nohup-restarting:
18+
scripts/hardRestarting.sh &
19+
1420
prod-covid:
1521
export TAR_RUN_PROJECT=covid_hosp_prod; Rscript scripts/run.R
1622

@@ -65,12 +71,6 @@ get-nwss:
6571
python nwss_covid_export.py; \
6672
python nwss_influenza_export.py
6773

68-
run-nohup:
69-
nohup Rscript scripts/run.R &
70-
71-
run-nohup-restarting:
72-
scripts/hardRestarting.sh &
73-
7474
sync:
7575
Rscript -e "source('R/sync_aws.R'); sync_aws()"
7676

R/aux_data_utils.R

+15-56
Original file line numberDiff line numberDiff line change
@@ -213,13 +213,15 @@ daily_to_weekly <- function(epi_df, agg_method = c("sum", "mean"), keys = "geo_v
213213
#' @param epi_arch the archive to aggregate.
214214
#' @param agg_columns the columns to aggregate.
215215
#' @param agg_method the method to use to aggregate the data, one of "sum" or "mean".
216-
#' @param day_of_week the day of the week to use as the reference day.
217-
#' @param day_of_week_end the day of the week to use as the end of the week.
216+
#' @param week_reference the day of the week to use as the reference day (Wednesday is default).
217+
#' Note that this is 1-indexed, so 1 = Sunday, 2 = Monday, ..., 7 = Saturday.
218+
#' @param week_start the day of the week to use as the start of the week (Sunday is default).
219+
#' Note that this is 1-indexed, so 1 = Sunday, 2 = Monday, ..., 7 = Saturday.
218220
daily_to_weekly_archive <- function(epi_arch,
219221
agg_columns,
220222
agg_method = c("sum", "mean"),
221-
day_of_week = 4L,
222-
day_of_week_end = 7L) {
223+
week_reference = 4L,
224+
week_start = 7L) {
223225
# How to aggregate the windowed data.
224226
agg_method <- arg_match(agg_method)
225227
# The columns we will later group by when aggregating.
@@ -230,67 +232,24 @@ daily_to_weekly_archive <- function(epi_arch,
230232
sort()
231233
# Choose a fast function to use to slide and aggregate.
232234
if (agg_method == "sum") {
233-
slide_fun <- epi_slide_sum
235+
# If the week is complete, this is equivalent to the sum. If the week is not
236+
# complete, this is equivalent to 7/(number of days in the week) * the sum,
237+
# which should be a decent approximation.
238+
agg_fun <- \(x) 7 * mean(x, na.rm = TRUE)
234239
} else if (agg_method == "mean") {
235-
slide_fun <- epi_slide_mean
240+
agg_fun <- \(x) mean(x, na.rm = TRUE)
236241
}
237242
# Slide over the versions and aggregate.
238243
epix_slide(
239244
epi_arch,
240245
.versions = ref_time_values,
241246
function(x, group_keys, ref_time) {
242-
# The last day of the week we will slide over.
243-
ref_time_last_week_end <- floor_date(ref_time, "week", day_of_week_end - 1)
244-
245-
# To find the days we will slide over, we need to find the first and last
246-
# complete weeks of data. Get the max and min times, and then find the
247-
# first and last complete weeks of data.
248-
min_time <- min(x$time_value)
249-
max_time <- max(x$time_value)
250-
251-
# Let's determine if the min and max times are in the same week.
252-
ceil_min_time <- ceiling_date(min_time, "week", week_start = day_of_week_end - 1)
253-
ceil_max_time <- ceiling_date(max_time, "week", week_start = day_of_week_end - 1)
254-
255-
# If they're not in the same week, this means we have at least one
256-
# complete week of data to slide over.
257-
if (ceil_min_time < ceil_max_time) {
258-
valid_slide_days <- seq.Date(
259-
from = ceiling_date(min_time, "week", week_start = day_of_week_end - 1),
260-
to = floor_date(max_time, "week", week_start = day_of_week_end - 1),
261-
by = 7L
262-
)
263-
} else {
264-
# This is the degenerate case, where we have about 1 week or less of
265-
# data. In this case, we opt to return nothing for two reasons:
266-
# 1. in most cases here, the data is incomplete for a single week,
267-
# 2. if the data is complete, a single week of data is not enough to
268-
# reasonably perform any kind of aggregation.
269-
return(tibble())
270-
}
271-
272-
# If the last day of the week is not the end of the week, add it to the
273-
# list of valid slide days (this will produce an incomplete slide, but
274-
# that's fine for us, since it should only be 1 day, historically.)
275-
if (wday(max_time) != day_of_week_end) {
276-
valid_slide_days <- c(valid_slide_days, max_time)
277-
}
278-
279247
# Slide over the days and aggregate.
280248
x %>%
281-
group_by(across(all_of(keys))) %>%
282-
slide_fun(
283-
agg_columns,
284-
.window_size = 7L,
285-
na.rm = TRUE,
286-
.ref_time_values = valid_slide_days
287-
) %>%
288-
select(-all_of(agg_columns)) %>%
289-
rename_with(~ gsub("slide_value_", "", .x)) %>%
290-
rename_with(~ gsub("_7dsum", "", .x)) %>%
291-
# Round all dates to reference day of the week. These will get
292-
# de-duplicated by compactify in as_epi_archive below.
293-
mutate(time_value = round_date(time_value, "week", day_of_week - 1)) %>%
249+
mutate(week_start = ceiling_date(time_value, "week", week_start = week_start)-1) %>%
250+
summarize(across(all_of(agg_columns), agg_fun), .by = all_of(c(keys, "week_start"))) %>%
251+
mutate(time_value = round_date(week_start, "week", week_reference - 1)) %>%
252+
select(-week_start) %>%
294253
as_tibble()
295254
}
296255
) %>%

R/forecasters/data_validation.R

-3
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,6 @@ confirm_sufficient_data <- function(epi_data, ahead, args_input, outcome, extra_
6868
# TODO: Buffer should probably be 2 * n(lags) * n(predictors). But honestly,
6969
# this needs to be fixed in epipredict itself, see
7070
# https://github.com/cmu-delphi/epipredict/issues/106.
71-
if (identical(extra_sources, "")) {
72-
extra_sources <- character(0L)
73-
}
7471
has_no_last_nas <- epi_data %>%
7572
drop_na(c(!!outcome, !!!extra_sources)) %>%
7673
group_by(geo_value) %>%

0 commit comments

Comments
 (0)