Test impl epi_slide with refactor tools

brookslogan · brookslogan · commit 036f6e2b630e · 2025-04-15T11:35:36.000-07:00
diff --git a/R/slide-refactor.R b/R/slide-refactor.R
@@ -146,6 +146,73 @@ upstream_slide_to_simple_hop <- function(.f, ..., .in_colnames, .out_colnames, .
   )
 }
 
+# ref_time_values_to_inp_ref_inds <- function(inp_tbl, ref_time_values) {
+#   matches <- vec_match(ref_time_values, inp_tbl$time_value)
+#   inp_ref_inds <- matches[!is.na(matches)]
+#   inp_ref_inds
+# }
+
+# complete_for_time_slide <- function(inp_tbl, inp_ref_inds, before_n_steps, after_n_steps) {
+#   if (before_n_steps == Inf) {
+#     # We need to get back to inp_tbl[1L,] from inp_tbl[min(inp_ref_inds),]
+#     start_padding <- min(inp_ref_inds) - 1L
+#   } else {
+#     start_padding <- before_n_steps
+#   }
+#   end_padding <- after_n_steps
+#   #
+
+#   slide_t_max <- out_t_max + after_n_steps * unit_step
+#   slide_nrow <- time_delta_to_n_steps(slide_t_max - slide_t_min, time_type) + 1L
+#   slide_time_values <- slide_t_min + 0L:(slide_nrow - 1L) * unit_step
+#   slide_inp_backrefs <- vec_match(slide_time_values, inp_tbl$time_value)
+# }
+
+ref_time_values_to_out_time_values <- function(inp_tbl, ref_time_values) {
+  vec_set_intersect(x = inp_tbl$time_value, y = ref_time_values) # requested ref times present in `inp_tbl`
+}
+
+# Apply `simple_hop` to a gap-filled copy of `inp_tbl`, with times handled as
+# integer step counts from the first `time_value`; returns `simple_hop`'s result.
+# TODO test whether origin time value stuff actually is helpful
+slide_window <- function(inp_tbl, epikey, simple_hop, before_n_steps, after_n_steps, unit_step, time_type, out_time_values) {
+  origin_time_value <- inp_tbl$time_value[[1L]]
+  inp_ts <- time_minus_time_in_n_steps(inp_tbl$time_value, origin_time_value, time_type)
+  out_ts <- time_minus_time_in_n_steps(out_time_values, origin_time_value, time_type)
+  if (vec_size(out_ts) == 0L) {
+    stop("FIXME TODO")
+  }
+  # `seq()` rejects infinite endpoints: an unbounded lookback starts at the data.
+  slide_start <- if (is.infinite(before_n_steps)) min(inp_ts, out_ts) else min(out_ts) - before_n_steps
+  slide_ts <- seq(slide_start, max(out_ts) + after_n_steps) # TODO compare min/max vs. `[[`
+  # Steps missing from `inp_tbl` get NA backrefs, hence all-NA rows when sliced.
+  # TODO refactor to use a join if not using backrefs later anymore?
+  # TODO perf: try removing time_value column before slice?
+  slide_inp_backrefs <- vec_match(slide_ts, inp_ts)
+  slide_tbl <- vec_slice(inp_tbl, slide_inp_backrefs)
+  slide_tbl$time_value <- origin_time_value + slide_ts * unit_step
+  # Positions of the requested output times within `slide_tbl`.
+  ref_inds <- vec_match(out_ts, slide_ts)
+  simple_hop(slide_tbl, epikey, ref_inds)
+}
+
+# # We should filter down the slide time values to ones in the input time values
+#   # when preparing the output:
+#   rows_should_keep1 <- !is.na(slide_inp_backrefs)
+#   # We also need to apply the out_filter.
+#   #
+#   # TODO comments + test vs. just using inequality
+#   rows_should_keep2 <- switch(out_filter_time_style,
+#     range = vec_rep_each(
+#       c(FALSE, TRUE, FALSE),
+#       c(slide_start_padding_n, slide_nrow - slide_start_padding_n - after_n_steps, after_n_steps),
+#     ),
+#     set = vec_in(slide_time_values, out_time_values)
+#   )
+#   rows_should_keep <- rows_should_keep1 & rows_should_keep2
+#   out_tbl <- vec_slice(slide, rows_should_keep)
+#   out_tbl
+
 # TODO maybe make ref_inds optional or have special handling if it's the whole sequence?  But can it ever be the full sequence in the common fixed-width window case?  Should be some truncation of it.
 
 # TODO decide whether/where to put time range stuff
diff --git a/R/slide.R b/R/slide.R
@@ -261,58 +261,130 @@ epi_slide <- function(
   # Check for duplicated time values within groups
   assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value")))
 
-  # Begin handling completion. This will create a complete time index between
-  # the smallest and largest time values in the data. This is used to ensure
-  # that the slide function is called with a complete window of data. Each slide
-  # group will filter this down to between its min and max time values. We also
-  # mark which dates were in the data and which were added by our completion.
-  date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type)
-  .x$.real <- TRUE
+  # # Begin handling completion. This will create a complete time index between
+  # # the smallest and largest time values in the data. This is used to ensure
+  # # that the slide function is called with a complete window of data. Each slide
+  # # group will filter this down to between its min and max time values. We also
+  # # mark which dates were in the data and which were added by our completion.
+  # date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type)
+  # .x$.real <- TRUE
 
-  # Create a wrapper that calculates and passes `.ref_time_value` to the
-  # computation. `i` is contained in the `slide_comp_wrapper_factory`
-  # environment such that when called within `slide_one_grp` `i` advances
-  # through the list of reference time values within a group and then resets
-  # back to 1 when switching groups.
-  slide_comp_wrapper_factory <- function(kept_ref_time_values) {
-    i <- 1L
-    slide_comp_wrapper <- function(.x, .group_key, ...) {
-      .ref_time_value <- kept_ref_time_values[[i]]
-      i <<- i + 1L
-      .slide_comp(.x, .group_key, .ref_time_value, ...)
+  # # Create a wrapper that calculates and passes `.ref_time_value` to the
+  # # computation. `i` is contained in the `slide_comp_wrapper_factory`
+  # # environment such that when called within `slide_one_grp` `i` advances
+  # # through the list of reference time values within a group and then resets
+  # # back to 1 when switching groups.
+  # slide_comp_wrapper_factory <- function(kept_ref_time_values) {
+  #   i <- 1L
+  #   slide_comp_wrapper <- function(.x, .group_key, ...) {
+  #     .ref_time_value <- kept_ref_time_values[[i]]
+  #     i <<- i + 1L
+  #     .slide_comp(.x, .group_key, .ref_time_value, ...)
+  #   }
+  #   slide_comp_wrapper
+  # }
+
+  # # - If .x is not grouped, then the trivial group is applied:
+  # #   https://dplyr.tidyverse.org/reference/group_map.html
+  # # - We create a lambda that forwards the necessary slide arguments to
+  # #   `epi_slide_one_group`.
+  # # - `...` from top of `epi_slide` are forwarded to `.f` here through
+  # #   group_modify and through the lambda.
+  # result <- group_map(
+  #   .x,
+  #   .f = function(.data_group, .group_key, ...) {
+  #     epi_slide_one_group(
+  #       .data_group, .group_key, ...,
+  #       .slide_comp_factory = slide_comp_wrapper_factory,
+  #       .before = window_args$before,
+  #       .after = window_args$after,
+  #       .ref_time_values = .ref_time_values,
+  #       .all_rows = .all_rows,
+  #       .new_col_name = .new_col_name,
+  #       .used_data_masking = used_data_masking,
+  #       .time_type = time_type,
+  #       .date_seq_list = date_seq_list
+  #     )
+  #   },
+  #   ...,
+  #   .keep = TRUE
+  # ) %>%
+  #   list_rbind() %>%
+  #   `[`(.$.real, names(.) != ".real") %>%
+  #   arrange_col_canonical() %>%
+  #   group_by(!!!.x_orig_groups)
+  before_n_steps <- time_delta_to_n_steps(window_args$before, time_type)
+  after_n_steps <- time_delta_to_n_steps(window_args$after, time_type)
+  unit_step <- unit_time_delta(time_type, format = "fast")
+  simple_hop <- time_slide_to_simple_hop(.slide_comp = .slide_comp, ..., .before_n_steps = before_n_steps, .after_n_steps = after_n_steps)
+  result <- .x %>%
+    group_modify(function(grp_data, grp_key) {
+      out_time_values <- ref_time_values_to_out_time_values(grp_data, .ref_time_values)
+      res <- grp_data
+      slide_values <- slide_window(grp_data, grp_key, simple_hop, before_n_steps, after_n_steps, unit_step, time_type, out_time_values)
+      # FIXME check, de-dupe, simplify, refactor, ...
+      if (.all_rows) {
+        new_slide_values <- vec_cast(rep(NA, nrow(res)), slide_values)
+        vec_slice(new_slide_values, vec_match(out_time_values, res$time_value)) <- slide_values
+        slide_values <- new_slide_values
+      } else {
+        res <- vec_slice(res, vec_match(out_time_values, res$time_value))
+      }
+
+  if (is.null(.new_col_name)) {
+    if (inherits(slide_values, "data.frame")) {
+      # Sometimes slide_values can parrot back columns already in `res`; allow
+      # this, but balk if a column has the same name as one in `res` but a
+      # different value:
+      comp_nms <- names(slide_values)
+      overlaps_existing_names <- comp_nms %in% names(res)
+      for (comp_i in which(overlaps_existing_names)) { # only name-clashing columns
+        if (!identical(slide_values[[comp_i]], res[[comp_nms[[comp_i]]]])) { # same name, different values
+          lines <- c( # error header, waldo diff of the two columns, then advice
+            cli::format_error(c(
+              "New column and old column clash",
+              "x" = "slide computation output included a
+                     {format_varname(comp_nms[[comp_i]])} column, but `.x` already had a
+                     {format_varname(comp_nms[[comp_i]])} column with differing values",
+              "Here are examples of differing values, where the grouping variables were
+               {format_tibble_row(grp_key)}:"
+            )), # `grp_key` is the group key argument of the enclosing callback
+            capture.output(print(waldo::compare(
+              res[[comp_nms[[comp_i]]]], slide_values[[comp_i]],
+              x_arg = rlang::expr_deparse(dplyr::expr(`$`(!!"existing", !!sym(comp_nms[[comp_i]])))), # nolint: object_usage_linter
+              y_arg = rlang::expr_deparse(dplyr::expr(`$`(!!"comp_value", !!sym(comp_nms[[comp_i]])))) # nolint: object_usage_linter
+            ))),
+            cli::format_message(c(
+              ">" = "You likely want to rename or remove this column from your slide
+                     computation's output, or debug why it has a different value."
+            ))
+          )
+          rlang::abort(paste(collapse = "\n", lines),
+            class = "epiprocess__epi_slide_output_vs_existing_column_conflict"
+          )
+        }
+      }
+      # Unpack into separate columns (without name prefix). If there are
+      # columns duplicating existing columns, de-dupe and order them as if they
+      # didn't exist in slide_values.
+      res <- dplyr::bind_cols(res, slide_values[!overlaps_existing_names])
+    } else {
+      # Apply default name (to vector or packed data.frame-type column):
+      if ("slide_value" %in% names(res)) {
+        cli_abort(c("Cannot guess a good column name for your output",
+          "x" = "`slide_value` already exists in `.x`",
+          ">" = "Please provide a `.new_col_name`."
+        ))
+      }
+      res[["slide_value"]] <- slide_values
     }
-    slide_comp_wrapper
+  } else {
+    # Vector or packed data.frame-type column (note: overlaps with existing
+    # column names should already be forbidden by earlier validation):
+    res[[.new_col_name]] <- slide_values
   }
-
-  # - If .x is not grouped, then the trivial group is applied:
-  #   https://dplyr.tidyverse.org/reference/group_map.html
-  # - We create a lambda that forwards the necessary slide arguments to
-  #   `epi_slide_one_group`.
-  # - `...` from top of `epi_slide` are forwarded to `.f` here through
-  #   group_modify and through the lambda.
-  result <- group_map(
-    .x,
-    .f = function(.data_group, .group_key, ...) {
-      epi_slide_one_group(
-        .data_group, .group_key, ...,
-        .slide_comp_factory = slide_comp_wrapper_factory,
-        .before = window_args$before,
-        .after = window_args$after,
-        .ref_time_values = .ref_time_values,
-        .all_rows = .all_rows,
-        .new_col_name = .new_col_name,
-        .used_data_masking = used_data_masking,
-        .time_type = time_type,
-        .date_seq_list = date_seq_list
-      )
-    },
-    ...,
-    .keep = TRUE
-  ) %>%
-    list_rbind() %>%
-    `[`(.$.real, names(.) != ".real") %>%
-    arrange_col_canonical() %>%
-    group_by(!!!.x_orig_groups)
+      res
+    })
 
   # If every group in epi_slide_one_group takes the
   # length(available_ref_time_values) == 0 branch then we end up here.