Skip to content

Commit c18c911

Browse files
committed
suggestions from Dan
1 parent 51a1c6c commit c18c911

File tree

4 files changed

+40
-50
lines changed

4 files changed

+40
-50
lines changed

R/arx_forecaster.R

+2-2
Original file line numberDiff line numberDiff line change
@@ -172,13 +172,13 @@ arx_fcast_epi_workflow <- function(
172172
r <- r %>%
173173
step_epi_naomit() %>%
174174
step_training_window(n_recent = args_list$n_training) %>%
175-
check_enough_data(all_predictors(), n = 1, skip = FALSE)
175+
check_enough_data(all_predictors(), min_data_points = 1, skip = FALSE)
176176

177177
if (!is.null(args_list$check_enough_data_n)) {
178178
r <- r %>% check_enough_data(
179179
all_predictors(),
180180
all_outcomes(),
181-
n = args_list$check_enough_data_n,
181+
min_data_points = args_list$check_enough_data_n,
182182
epi_keys = args_list$check_enough_data_epi_keys,
183183
drop_na = FALSE
184184
)

R/check_enough_data.R

+17-27
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
#' @param ... One or more selector functions to choose variables for this check.
99
#' See [selections()] for more details. You will usually want to use
1010
#' [recipes::all_predictors()] and/or [recipes::all_outcomes()] here.
11-
#' @param n The minimum number of data points required for training. If this is
12-
#' NULL, the total number of predictors will be used.
11+
#' @param min_data_points The minimum number of data points required for
12+
#' training. If this is NULL, the total number of predictors will be used.
1313
#' @param epi_keys A character vector of column names on which to group the data
1414
#' and check threshold within each group. Useful if your forecaster trains
1515
#' per group (for example, per geo_value).
@@ -18,8 +18,6 @@
1818
#' created.
1919
#' @param trained A logical for whether the selectors in `...`
2020
#' have been resolved by [prep()].
21-
#' @param columns An internal argument that tracks which columns are evaluated
22-
#' for this check. Should not be used by the user.
2321
#' @param id A character string that is unique to this check to identify it.
2422
#' @param skip A logical. If `TRUE`, only training data is checked, while if
2523
#' `FALSE`, both training and predicting data is checked. Technically, this
@@ -46,36 +44,36 @@
4644
check_enough_data <-
4745
function(recipe,
4846
...,
49-
n = NULL,
47+
min_data_points = NULL,
5048
epi_keys = NULL,
5149
drop_na = TRUE,
5250
role = NA,
5351
trained = FALSE,
54-
columns = NULL,
5552
skip = TRUE,
5653
id = rand_id("enough_data")) {
5754
recipes::add_check(
5855
recipe,
5956
check_enough_data_new(
60-
n = n,
57+
min_data_points = min_data_points,
6158
epi_keys = epi_keys,
6259
drop_na = drop_na,
6360
terms = enquos(...),
6461
role = role,
6562
trained = trained,
66-
columns = columns,
63+
columns = NULL,
6764
skip = skip,
6865
id = id
6966
)
7067
)
7168
}
7269

7370
check_enough_data_new <-
74-
function(n, epi_keys, drop_na, terms, role, trained, columns, skip, id) {
71+
function(min_data_points, epi_keys, drop_na, terms,
72+
role, trained, columns, skip, id) {
7573
recipes::check(
7674
subclass = "enough_data",
7775
prefix = "check_",
78-
n = n,
76+
min_data_points = min_data_points,
7977
epi_keys = epi_keys,
8078
drop_na = drop_na,
8179
terms = terms,
@@ -90,15 +88,12 @@ check_enough_data_new <-
9088
#' @export
9189
prep.check_enough_data <- function(x, training, info = NULL, ...) {
9290
col_names <- recipes::recipes_eval_select(x$terms, training, info)
93-
if (is.null(x$n)) {
94-
x$n <- length(col_names)
91+
if (is.null(x$min_data_points)) {
92+
x$min_data_points <- length(col_names)
9593
}
9694

97-
check_enough_data_core(training, x, col_names, "train")
98-
99-
10095
check_enough_data_new(
101-
n = x$n,
96+
min_data_points = x$min_data_points,
10297
epi_keys = x$epi_keys,
10398
drop_na = x$drop_na,
10499
terms = x$terms,
@@ -119,7 +114,7 @@ bake.check_enough_data <- function(object, new_data, ...) {
119114

120115
#' @export
121116
print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) {
122-
title <- paste0("Check enough data (n = ", x$n, ") for ")
117+
title <- paste0("Check enough data (n = ", x$min_data_points, ") for ")
123118
recipes::print_step(x$columns, x$terms, x$trained, title, width)
124119
invisible(x)
125120
}
@@ -132,7 +127,7 @@ tidy.check_enough_data <- function(x, ...) {
132127
res <- tibble(terms = recipes::sel2char(x$terms))
133128
}
134129
res$id <- x$id
135-
res$n <- x$n
130+
res$min_data_points <- x$min_data_points
136131
res$epi_keys <- x$epi_keys
137132
res$drop_na <- x$drop_na
138133
res
@@ -145,18 +140,18 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
145140
any_missing_data <- epi_df %>%
146141
mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>%
147142
# count the number of rows where they're all not na
148-
summarise(sum(any_are_na == 0) < .env$step_obj$n, .groups = "drop")
143+
summarise(sum(any_are_na == 0) < .env$step_obj$min_data_points, .groups = "drop")
149144
any_missing_data <- any_missing_data %>%
150145
summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>%
151146
any()
152147

153-
# figuring out which individual columns (if any) are to blame for this darth
148+
# figuring out which individual columns (if any) are to blame for this dearth
154149
# of data
155150
cols_not_enough_data <- epi_df %>%
156151
summarise(
157152
across(
158153
all_of(.env$col_names),
159-
~ sum(!is.na(.x)) < .env$step_obj$n
154+
~ sum(!is.na(.x)) < .env$step_obj$min_data_points
160155
),
161156
.groups = "drop"
162157
) %>%
@@ -176,12 +171,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
176171
} else {
177172
# if we're not dropping na values, just count
178173
cols_not_enough_data <- epi_df %>%
179-
summarise(
180-
across(
181-
all_of(.env$col_names),
182-
~ dplyr::n() < .env$step_obj$n
183-
)
184-
)
174+
summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_data_points))
185175
any_missing_data <- cols_not_enough_data %>%
186176
summarize(across(all_of(.env$col_names), all)) %>%
187177
all()

tests/testthat/_snaps/check_enough_data.md

+9-9
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,35 @@
11
# check_enough_data works on pooled data
22

33
Code
4-
epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
5-
prep(toy_epi_df)
4+
epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n + 1,
5+
drop_na = FALSE) %>% prep(toy_epi_df)
66
Condition
77
Error in `check_enough_data_core()`:
88
! The following columns don't have enough data to train: x and y.
99

1010
---
1111

1212
Code
13-
epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
14-
prep(toy_epi_df)
13+
epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 1,
14+
drop_na = TRUE) %>% prep(toy_epi_df)
1515
Condition
1616
Error in `check_enough_data_core()`:
1717
! The following columns don't have enough data to train: x.
1818

1919
# check_enough_data works on unpooled data
2020

2121
Code
22-
epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = n + 1, epi_keys = "geo_value",
23-
drop_na = FALSE) %>% prep(toy_epi_df)
22+
epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = n + 1,
23+
epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df)
2424
Condition
2525
Error in `check_enough_data_core()`:
2626
! The following columns don't have enough data to train: x and y.
2727

2828
---
2929

3030
Code
31-
epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value",
32-
drop_na = TRUE) %>% prep(toy_epi_df)
31+
epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 3,
32+
epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df)
3333
Condition
3434
Error in `check_enough_data_core()`:
3535
! The following columns don't have enough data to train: x and y.
@@ -47,7 +47,7 @@
4747

4848
Code
4949
epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data(
50-
all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df)
50+
all_predictors(), y, min_data_points = 2 * n - 4) %>% prep(toy_epi_df)
5151
Condition
5252
Error in `check_enough_data_core()`:
5353
! The following columns don't have enough data to train: no single column, but the combination of lag_1_x, lag_2_x, y.

tests/testthat/test-check_enough_data.R

+12-12
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,22 @@ test_that("check_enough_data works on pooled data", {
1818
# Check both columns have enough data
1919
expect_no_error(
2020
epi_recipe(toy_epi_df) %>%
21-
check_enough_data(x, y, n = 2 * n, drop_na = FALSE) %>%
21+
check_enough_data(x, y, min_data_points = 2 * n, drop_na = FALSE) %>%
2222
prep(toy_epi_df) %>%
2323
bake(new_data = NULL)
2424
)
2525
# Check both column don't have enough data
2626
expect_snapshot(
2727
error = TRUE,
2828
epi_recipe(toy_epi_df) %>%
29-
check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
29+
check_enough_data(x, y, min_data_points = 2 * n + 1, drop_na = FALSE) %>%
3030
prep(toy_epi_df)
3131
)
3232
# Check drop_na works
3333
expect_snapshot(
3434
error = TRUE,
3535
epi_recipe(toy_epi_df) %>%
36-
check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
36+
check_enough_data(x, y, min_data_points = 2 * n - 1, drop_na = TRUE) %>%
3737
prep(toy_epi_df)
3838
)
3939
})
@@ -42,30 +42,30 @@ test_that("check_enough_data works on unpooled data", {
4242
# Check both columns have enough data
4343
expect_no_error(
4444
epi_recipe(toy_epi_df) %>%
45-
check_enough_data(x, y, n = n, epi_keys = "geo_value", drop_na = FALSE) %>%
45+
check_enough_data(x, y, min_data_points = n, epi_keys = "geo_value", drop_na = FALSE) %>%
4646
prep(toy_epi_df) %>%
4747
bake(new_data = NULL)
4848
)
4949
# Check one column don't have enough data
5050
expect_snapshot(
5151
error = TRUE,
5252
epi_recipe(toy_epi_df) %>%
53-
check_enough_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
53+
check_enough_data(x, y, min_data_points = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
5454
prep(toy_epi_df)
5555
)
5656
# Check drop_na works
5757
expect_snapshot(
5858
error = TRUE,
5959
epi_recipe(toy_epi_df) %>%
60-
check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
60+
check_enough_data(x, y, min_data_points = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
6161
prep(toy_epi_df)
6262
)
6363
})
6464

6565
test_that("check_enough_data outputs the correct recipe values", {
6666
expect_no_error(
6767
p <- epi_recipe(toy_epi_df) %>%
68-
check_enough_data(x, y, n = 2 * n - 2) %>%
68+
check_enough_data(x, y, min_data_points = 2 * n - 2) %>%
6969
prep(toy_epi_df) %>%
7070
bake(new_data = NULL)
7171
)
@@ -90,15 +90,15 @@ test_that("check_enough_data only checks train data when skip = FALSE", {
9090
epiprocess::as_epi_df()
9191
expect_no_error(
9292
epi_recipe(toy_epi_df) %>%
93-
check_enough_data(x, y, n = n - 2, epi_keys = "geo_value") %>%
93+
check_enough_data(x, y, min_data_points = n - 2, epi_keys = "geo_value") %>%
9494
prep(toy_epi_df) %>%
9595
bake(new_data = toy_test_data)
9696
)
9797
# Making sure `skip = TRUE` is working correctly in `predict`
9898
expect_no_error(
9999
epi_recipe(toy_epi_df) %>%
100100
add_role(y, new_role = "outcome") %>%
101-
check_enough_data(x, n = n - 2, epi_keys = "geo_value") %>%
101+
check_enough_data(x, min_data_points = n - 2, epi_keys = "geo_value") %>%
102102
epi_workflow(linear_reg()) %>%
103103
fit(toy_epi_df) %>%
104104
predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08"))
@@ -108,7 +108,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", {
108108
expect_no_error(
109109
forecaster <- epi_recipe(toy_epi_df) %>%
110110
add_role(y, new_role = "outcome") %>%
111-
check_enough_data(x, n = 1, epi_keys = "geo_value", skip = FALSE) %>%
111+
check_enough_data(x, min_data_points = 1, epi_keys = "geo_value", skip = FALSE) %>%
112112
epi_workflow(linear_reg()) %>%
113113
fit(toy_epi_df)
114114
)
@@ -125,15 +125,15 @@ test_that("check_enough_data works with all_predictors() downstream of construct
125125
expect_no_error(
126126
epi_recipe(toy_epi_df) %>%
127127
step_epi_lag(x, lag = c(1, 2)) %>%
128-
check_enough_data(all_predictors(), y, n = 2 * n - 5) %>%
128+
check_enough_data(all_predictors(), y, min_data_points = 2 * n - 5) %>%
129129
prep(toy_epi_df) %>%
130130
bake(new_data = NULL)
131131
)
132132
expect_snapshot(
133133
error = TRUE,
134134
epi_recipe(toy_epi_df) %>%
135135
step_epi_lag(x, lag = c(1, 2)) %>%
136-
check_enough_data(all_predictors(), y, n = 2 * n - 4) %>%
136+
check_enough_data(all_predictors(), y, min_data_points = 2 * n - 4) %>%
137137
prep(toy_epi_df)
138138
)
139139
})

0 commit comments

Comments
 (0)