8
8
# ' @param ... One or more selector functions to choose variables for this check.
9
9
# ' See [selections()] for more details. You will usually want to use
10
10
# ' [recipes::all_predictors()] and/or [recipes::all_outcomes()] here.
11
- # ' @param n The minimum number of data points required for training. If this is
12
- # ' NULL, the total number of predictors will be used.
11
+ # ' @param min_data_points The minimum number of data points required for
12
+ # ' training. If this is NULL, the total number of predictors will be used.
13
13
# ' @param epi_keys A character vector of column names on which to group the data
14
14
# ' and check threshold within each group. Useful if your forecaster trains
15
15
# ' per group (for example, per geo_value).
18
18
# ' created.
19
19
# ' @param trained A logical for whether the selectors in `...`
20
20
# ' have been resolved by [prep()].
21
- # ' @param columns An internal argument that tracks which columns are evaluated
22
- # ' for this check. Should not be used by the user.
23
21
# ' @param id A character string that is unique to this check to identify it.
24
22
# ' @param skip A logical. If `TRUE`, only training data is checked, while if
25
23
# ' `FALSE`, both training and predicting data is checked. Technically, this
46
44
check_enough_data <-
47
45
function (recipe ,
48
46
... ,
49
- n = NULL ,
47
+ min_data_points = NULL ,
50
48
epi_keys = NULL ,
51
49
drop_na = TRUE ,
52
50
role = NA ,
53
51
trained = FALSE ,
54
- columns = NULL ,
55
52
skip = TRUE ,
56
53
id = rand_id(" enough_data" )) {
57
54
recipes :: add_check(
58
55
recipe ,
59
56
check_enough_data_new(
60
- n = n ,
57
+ min_data_points = min_data_points ,
61
58
epi_keys = epi_keys ,
62
59
drop_na = drop_na ,
63
60
terms = enquos(... ),
64
61
role = role ,
65
62
trained = trained ,
66
- columns = columns ,
63
+ columns = NULL ,
67
64
skip = skip ,
68
65
id = id
69
66
)
70
67
)
71
68
}
72
69
73
70
check_enough_data_new <-
74
- function (n , epi_keys , drop_na , terms , role , trained , columns , skip , id ) {
71
+ function (min_data_points , epi_keys , drop_na , terms ,
72
+ role , trained , columns , skip , id ) {
75
73
recipes :: check(
76
74
subclass = " enough_data" ,
77
75
prefix = " check_" ,
78
- n = n ,
76
+ min_data_points = min_data_points ,
79
77
epi_keys = epi_keys ,
80
78
drop_na = drop_na ,
81
79
terms = terms ,
@@ -90,15 +88,12 @@ check_enough_data_new <-
90
88
# ' @export
91
89
prep.check_enough_data <- function (x , training , info = NULL , ... ) {
92
90
col_names <- recipes :: recipes_eval_select(x $ terms , training , info )
93
- if (is.null(x $ n )) {
94
- x $ n <- length(col_names )
91
+ if (is.null(x $ min_data_points )) {
92
+ x $ min_data_points <- length(col_names )
95
93
}
96
94
97
- check_enough_data_core(training , x , col_names , " train" )
98
-
99
-
100
95
check_enough_data_new(
101
- n = x $ n ,
96
+ min_data_points = x $ min_data_points ,
102
97
epi_keys = x $ epi_keys ,
103
98
drop_na = x $ drop_na ,
104
99
terms = x $ terms ,
@@ -119,7 +114,7 @@ bake.check_enough_data <- function(object, new_data, ...) {
119
114
120
115
# ' @export
121
116
print.check_enough_data <- function (x , width = max(20 , options()$ width - 30 ), ... ) {
122
- title <- paste0(" Check enough data (n = " , x $ n , " ) for " )
117
+ title <- paste0(" Check enough data (n = " , x $ min_data_points , " ) for " )
123
118
recipes :: print_step(x $ columns , x $ terms , x $ trained , title , width )
124
119
invisible (x )
125
120
}
@@ -132,7 +127,7 @@ tidy.check_enough_data <- function(x, ...) {
132
127
res <- tibble(terms = recipes :: sel2char(x $ terms ))
133
128
}
134
129
res $ id <- x $ id
135
- res $ n <- x $ n
130
+ res $ min_data_points <- x $ min_data_points
136
131
res $ epi_keys <- x $ epi_keys
137
132
res $ drop_na <- x $ drop_na
138
133
res
@@ -145,18 +140,18 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
145
140
any_missing_data <- epi_df %> %
146
141
mutate(any_are_na = rowSums(across(any_of(.env $ col_names ), ~ is.na(.x ))) > 0 ) %> %
147
142
# count the number of rows where they're all not na
148
- summarise(sum(any_are_na == 0 ) < .env $ step_obj $ n , .groups = " drop" )
143
+ summarise(sum(any_are_na == 0 ) < .env $ step_obj $ min_data_points , .groups = " drop" )
149
144
any_missing_data <- any_missing_data %> %
150
145
summarize(across(all_of(setdiff(names(any_missing_data ), step_obj $ epi_keys )), any )) %> %
151
146
any()
152
147
153
- # figuring out which individual columns (if any) are to blame for this darth
148
+ # figuring out which individual columns (if any) are to blame for this dearth
154
149
# of data
155
150
cols_not_enough_data <- epi_df %> %
156
151
summarise(
157
152
across(
158
153
all_of(.env $ col_names ),
159
- ~ sum(! is.na(.x )) < .env $ step_obj $ n
154
+ ~ sum(! is.na(.x )) < .env $ step_obj $ min_data_points
160
155
),
161
156
.groups = " drop"
162
157
) %> %
@@ -176,12 +171,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
176
171
} else {
177
172
# if we're not dropping na values, just count
178
173
cols_not_enough_data <- epi_df %> %
179
- summarise(
180
- across(
181
- all_of(.env $ col_names ),
182
- ~ dplyr :: n() < .env $ step_obj $ n
183
- )
184
- )
174
+ summarise(across(all_of(.env $ col_names ), ~ dplyr :: n() < .env $ step_obj $ min_data_points ))
185
175
any_missing_data <- cols_not_enough_data %> %
186
176
summarize(across(all_of(.env $ col_names ), all )) %> %
187
177
all()
0 commit comments