suggestions from Dan

dsweber2 · dsweber2 · commit c18c911d2eb5 · 2025-03-27T12:48:53.000-05:00
diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R
@@ -172,13 +172,13 @@ arx_fcast_epi_workflow <- function(
   r <- r %>%
     step_epi_naomit() %>%
     step_training_window(n_recent = args_list$n_training) %>%
-    check_enough_data(all_predictors(), n = 1, skip = FALSE)
+    check_enough_data(all_predictors(), min_data_points = 1, skip = FALSE)
 
   if (!is.null(args_list$check_enough_data_n)) {
     r <- r %>% check_enough_data(
       all_predictors(),
       all_outcomes(),
-      n = args_list$check_enough_data_n,
+      min_data_points = args_list$check_enough_data_n,
       epi_keys = args_list$check_enough_data_epi_keys,
       drop_na = FALSE
     )
diff --git a/R/check_enough_data.R b/R/check_enough_data.R
@@ -8,8 +8,8 @@
 #' @param ... One or more selector functions to choose variables for this check.
 #'  See [selections()] for more details. You will usually want to use
 #'  [recipes::all_predictors()] and/or [recipes::all_outcomes()] here.
-#' @param n The minimum number of data points required for training. If this is
-#'   NULL, the total number of predictors will be used.
+#' @param min_data_points The minimum number of data points required for
+#'   training. If `NULL`, the number of columns selected by `...` is used.
 #' @param epi_keys A character vector of column names on which to group the data
 #'   and check threshold within each group. Useful if your forecaster trains
 #'   per group (for example, per geo_value).
@@ -18,8 +18,6 @@
 #'  created.
 #' @param trained A logical for whether the selectors in `...`
 #' have been resolved by [prep()].
-#' @param columns An internal argument that tracks which columns are evaluated
-#'   for this check. Should not be used by the user.
 #' @param id A character string that is unique to this check to identify it.
 #' @param skip A logical. If `TRUE`, only training data is checked, while if
 #'   `FALSE`, both training and predicting data is checked. Technically, this
@@ -46,36 +44,36 @@
 check_enough_data <-
   function(recipe,
            ...,
-           n = NULL,
+           min_data_points = NULL,
            epi_keys = NULL,
            drop_na = TRUE,
            role = NA,
            trained = FALSE,
-           columns = NULL,
            skip = TRUE,
            id = rand_id("enough_data")) {
     recipes::add_check(
       recipe,
       check_enough_data_new(
-        n = n,
+        min_data_points = min_data_points,
         epi_keys = epi_keys,
         drop_na = drop_na,
         terms = enquos(...),
         role = role,
         trained = trained,
-        columns = columns,
+        columns = NULL,
         skip = skip,
         id = id
       )
     )
   }
 
 check_enough_data_new <-
-  function(n, epi_keys, drop_na, terms, role, trained, columns, skip, id) {
+  function(min_data_points, epi_keys, drop_na, terms,
+           role, trained, columns, skip, id) {
     recipes::check(
       subclass = "enough_data",
       prefix = "check_",
-      n = n,
+      min_data_points = min_data_points,
       epi_keys = epi_keys,
       drop_na = drop_na,
       terms = terms,
@@ -90,15 +88,12 @@ check_enough_data_new <-
 #' @export
 prep.check_enough_data <- function(x, training, info = NULL, ...) {
   col_names <- recipes::recipes_eval_select(x$terms, training, info)
-  if (is.null(x$n)) {
-    x$n <- length(col_names)
+  if (is.null(x$min_data_points)) {
+    x$min_data_points <- length(col_names)
   }
 
-  check_enough_data_core(training, x, col_names, "train")
-
-
   check_enough_data_new(
-    n = x$n,
+    min_data_points = x$min_data_points,
     epi_keys = x$epi_keys,
     drop_na = x$drop_na,
     terms = x$terms,
@@ -119,7 +114,7 @@ bake.check_enough_data <- function(object, new_data, ...) {
 
 #' @export
 print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) {
-  title <- paste0("Check enough data (n = ", x$n, ") for ")
+  title <- paste0("Check enough data (n = ", x$min_data_points, ") for ")
   recipes::print_step(x$columns, x$terms, x$trained, title, width)
   invisible(x)
 }
@@ -132,7 +127,7 @@ tidy.check_enough_data <- function(x, ...) {
     res <- tibble(terms = recipes::sel2char(x$terms))
   }
   res$id <- x$id
-  res$n <- x$n
+  res$min_data_points <- x$min_data_points
   res$epi_keys <- x$epi_keys
   res$drop_na <- x$drop_na
   res
@@ -145,18 +140,18 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
     any_missing_data <- epi_df %>%
       mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>%
       # count the number of rows where they're all not na
-      summarise(sum(any_are_na == 0) < .env$step_obj$n, .groups = "drop")
+      summarise(sum(any_are_na == 0) < .env$step_obj$min_data_points, .groups = "drop")
     any_missing_data <- any_missing_data %>%
       summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>%
       any()
 
-    # figuring out which individual columns (if any) are to blame for this darth
+    # figuring out which individual columns (if any) are to blame for this dearth
     # of data
     cols_not_enough_data <- epi_df %>%
       summarise(
         across(
           all_of(.env$col_names),
-          ~ sum(!is.na(.x)) < .env$step_obj$n
+          ~ sum(!is.na(.x)) < .env$step_obj$min_data_points
         ),
         .groups = "drop"
       ) %>%
@@ -176,12 +171,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
   } else {
     # if we're not dropping na values, just count
     cols_not_enough_data <- epi_df %>%
-      summarise(
-        across(
-          all_of(.env$col_names),
-          ~ dplyr::n() < .env$step_obj$n
-        )
-      )
+      summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_data_points))
     any_missing_data <- cols_not_enough_data %>%
       summarize(across(all_of(.env$col_names), all)) %>%
       all()
diff --git a/tests/testthat/_snaps/check_enough_data.md b/tests/testthat/_snaps/check_enough_data.md
@@ -1,35 +1,35 @@
 # check_enough_data works on pooled data
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
-        prep(toy_epi_df)
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n + 1,
+      drop_na = FALSE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
       ! The following columns don't have enough data to train: x and y.
 
 ---
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
-        prep(toy_epi_df)
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 1,
+      drop_na = TRUE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
       ! The following columns don't have enough data to train: x.
 
 # check_enough_data works on unpooled data
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = n + 1, epi_keys = "geo_value",
-      drop_na = FALSE) %>% prep(toy_epi_df)
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = n + 1,
+      epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
       ! The following columns don't have enough data to train: x and y.
 
 ---
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value",
-      drop_na = TRUE) %>% prep(toy_epi_df)
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 3,
+      epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
       ! The following columns don't have enough data to train: x and y.
@@ -47,7 +47,7 @@
 
     Code
       epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data(
-        all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df)
+        all_predictors(), y, min_data_points = 2 * n - 4) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
       ! The following columns don't have enough data to train: no single column, but the combination of lag_1_x, lag_2_x, y.
diff --git a/tests/testthat/test-check_enough_data.R b/tests/testthat/test-check_enough_data.R
@@ -18,22 +18,22 @@ test_that("check_enough_data works on pooled data", {
   # Check both columns have enough data
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = 2 * n, drop_na = FALSE) %>%
+      check_enough_data(x, y, min_data_points = 2 * n, drop_na = FALSE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
   # Check both column don't have enough data
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
+      check_enough_data(x, y, min_data_points = 2 * n + 1, drop_na = FALSE) %>%
       prep(toy_epi_df)
   )
   # Check drop_na works
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
+      check_enough_data(x, y, min_data_points = 2 * n - 1, drop_na = TRUE) %>%
       prep(toy_epi_df)
   )
 })
@@ -42,30 +42,30 @@ test_that("check_enough_data works on unpooled data", {
   # Check both columns have enough data
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = n, epi_keys = "geo_value", drop_na = FALSE) %>%
+      check_enough_data(x, y, min_data_points = n, epi_keys = "geo_value", drop_na = FALSE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
   # Check one column don't have enough data
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
+      check_enough_data(x, y, min_data_points = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
       prep(toy_epi_df)
   )
   # Check drop_na works
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
+      check_enough_data(x, y, min_data_points = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
       prep(toy_epi_df)
   )
 })
 
 test_that("check_enough_data outputs the correct recipe values", {
   expect_no_error(
     p <- epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = 2 * n - 2) %>%
+      check_enough_data(x, y, min_data_points = 2 * n - 2) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -90,15 +90,15 @@ test_that("check_enough_data only checks train data when skip = FALSE", {
     epiprocess::as_epi_df()
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = n - 2, epi_keys = "geo_value") %>%
+      check_enough_data(x, y, min_data_points = n - 2, epi_keys = "geo_value") %>%
       prep(toy_epi_df) %>%
       bake(new_data = toy_test_data)
   )
   # Making sure `skip = TRUE` is working correctly in `predict`
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
       add_role(y, new_role = "outcome") %>%
-      check_enough_data(x, n = n - 2, epi_keys = "geo_value") %>%
+      check_enough_data(x, min_data_points = n - 2, epi_keys = "geo_value") %>%
       epi_workflow(linear_reg()) %>%
       fit(toy_epi_df) %>%
       predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08"))
@@ -108,7 +108,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", {
   expect_no_error(
     forecaster <- epi_recipe(toy_epi_df) %>%
       add_role(y, new_role = "outcome") %>%
-      check_enough_data(x, n = 1, epi_keys = "geo_value", skip = FALSE) %>%
+      check_enough_data(x, min_data_points = 1, epi_keys = "geo_value", skip = FALSE) %>%
       epi_workflow(linear_reg()) %>%
       fit(toy_epi_df)
   )
@@ -125,15 +125,15 @@ test_that("check_enough_data works with all_predictors() downstream of construct
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
-      check_enough_data(all_predictors(), y, n = 2 * n - 5) %>%
+      check_enough_data(all_predictors(), y, min_data_points = 2 * n - 5) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
-      check_enough_data(all_predictors(), y, n = 2 * n - 4) %>%
+      check_enough_data(all_predictors(), y, min_data_points = 2 * n - 4) %>%
       prep(toy_epi_df)
   )
 })