From 55ddeee824e2b1106b5249ea14c78bddfbc465fa Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Thu, 10 Apr 2025 17:25:13 -0500
Subject: [PATCH 01/62] initial decreasing forecasters rmd

---
 scripts/reports/decreasing_forecasters.Rmd | 176 +++++++++++++++++++++
 1 file changed, 176 insertions(+)
 create mode 100644 scripts/reports/decreasing_forecasters.Rmd

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
new file mode 100644
index 00000000..c87e9089
--- /dev/null
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -0,0 +1,176 @@
+---
+title: "Decreasing Forecasters"
+author: Delphi Forecast Team
+date: "`Sys.date()`"
+output:
+  html_document:
+    code_folding: show
+    toc: True
+    # self_contained: False
+    # lib_dir: libs
+params:
+  disease: "covid"
+  forecast_res: !r ""
+  forecast_date: !r ""
+  truth_data: !r ""
+---
+
+$$\\[.4in]$$
+
+```{r echo=FALSE}
+knitr::opts_chunk$set(
+  fig.align = "center",
+  message = FALSE,
+  warning = FALSE,
+  cache = FALSE
+)
+knitr::opts_knit$set(root.dir = here::here())
+ggplot2::theme_set(ggplot2::theme_bw())
+source(here::here("R/load_all.R"))
+```
+
+Partially part of the retrospective from this year.
+For many of the direct forecasters, the forecast is strictly decreasing, even in the middle of the surge.
+This effect is most prominent in flu, but occurs somewhat in covid.
+We need to resolve the source of this.
+It is some combination of the data and the models used.
+
+This notebook depends on having successfully run the `flu_hosp_explore` targets pipeline to handle the creation of the basic dataset.
+Accordingly, you need `.Renviron` to include `TAR_PROJECT=flu_hosp_prod`.
+```{r}
+tar_make(joined_archive_data)
+joined_archive <- tar_read(joined_archive_data)
+hhs_archive <- tar_read(hhs_archive) %>% as_epi_archive()
+```
+
+To avoid running too frequently, we'll limit to a single forecast date just after the peak of the rate of growth, so that ~ everywhere is increasing.
+
+```{r}
+hhs_archive %>% epix_as_of_current() %>% filter(time_value > "2023-10-01") %>% autoplot(hhs) 
+hhs_gr <- hhs_archive %>%
+  epix_as_of_current() %>%
+  group_by(geo_value) %>%
+  mutate(gr_hhs = growth_rate(hhs)) %>%
+  filter(time_value > "2023-10-01")
+hhs_gr %>%
+  arrange(gr_hhs) %>%
+  drop_na() %>%
+  slice_max(gr_hhs) %>%
+  ungroup() %>%
+  group_by(time_value) %>%
+  summarize(nn = length(hhs))
+```
+
+So the peak is ~ 11/15
+
+```{r}
+forecast_date <- as.Date("2023-11-29")
+hhs_gr %>% autoplot(gr_hhs) +
+  geom_vline(aes(xintercept = forecast_date), lty = 2) +
+  labs(title = "growth rates")
+```
+
+And most locations are still increasing 2 weeks later on the 29th, so we'll use that
+
+# Some utility functions
+
+Since we don't really need to run the full pipeline to get forecasts from a single day and forecaster, we build a couple of functions for inspecting forecasts.
+```{r}
+forecast_aheads <- function(forecaster, epi_data = hhs_forecast, aheads = 0:4 * 7) {
+  all_forecasts <- map(aheads, \(ahead) forecaster(epi_data, ahead)) %>% list_rbind()
+  all_forecasts
+}
+```
+
+Here's a way to easily plot a subset of the forecasts, with bands at the 80% and 50% intervals (.1-.9 and .25-.75) against the finalized data.
+```{r}
+plot_forecasts <- function(all_forecasts,
+                           geo_values,
+                           data_archive = hhs_archive,
+                           earliest_truth_data = NULL) {
+  if (is.null(earliest_truth_data)) {
+    earliest_truth_data <- all_forecasts$forecast_date[[1]] - as.difftime(365, units = "days")
+  }
+  # transform the archive to something useful for comparison
+  finalized_plotting <- data_archive %>%
+    epix_as_of_current() %>%
+    filter(time_value <= max(all_forecasts$target_end_date), geo_value %in% geo_values) %>%
+    as_tibble() %>%
+    mutate(forecast_date = time_value) %>%
+    filter(time_value >= earliest_truth_data)
+  all_forecasts %>% filter(geo_value %in% geo_values) %>%
+    pivot_wider(names_from = quantile, values_from = value) %>%
+    ggplot(aes(x = target_end_date, group = geo_value, fill = forecast_date)) +
+    geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), alpha = 0.4) +
+    geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), alpha = 0.6) +
+    geom_line(aes(y = `0.5`, color = forecast_date)) +
+    geom_line(
+      data = finalized_plotting, aes(x = time_value, y = hhs)) +
+    facet_wrap(~geo_value, scale = "free") +
+    theme(legend.position = "none")
+}
+```
+
+And a method to inspect whether things are increasing that isn't just the eyeball norm on a few of them.
+This calculates growth rates for each quantile and each location.
+```{r}
+get_growth_rates <- function(forecasts, quantiles = NULL, outlier_bound = 1e2, ...) {
+  if (is.null(quantiles)) {
+    quantiles <- forecasts$quantile %>% unique()
+  }
+  forecasts %>%
+    group_by(geo_value, quantile) %>%
+    filter(min(value) != max(value), quantile %in% quantiles) %>%
+    mutate(growth = growth_rate(value, ...)) %>%
+    filter(abs(growth) < outlier_bound)
+}
+```
+
+# Establishing the problem
+
+```{r}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+all_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead))
+default_geos <- c("ca", "fl", "ny", "pa", "tx")
+plot_forecasts(all_forecasts, default_geos)
+```
+
+All the forecasts are going down rather than up, even though they have multiple weeks of data!
+More quantitatively, across all geos:
+```{r}
+basic_gr <- get_growth_rates(all_forecasts, quantiles = 0.5, method = "smooth_spline")
+basic_gr %>% arrange(desc(growth))
+```
+The only places where the growth rate is positive are american samoa and the US overall, both of which have unusual data trends (as because it is ~0, and the US because it is unusually large).
+As a histogram (each state is included 5 times, once per ahead):
+```{r}
+basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
+```
+## It goes away if we use very short windows
+If we limit to the last 3 weeks of data (so effectively just a linear extrapolation shared across geos), it goes away:
+```{r}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+all_short_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, n_training=3))
+plot_forecasts(all_short_forecasts, default_geos)
+```
+
+They're pretty jittery, but strictly decreasing they are not.
+And the corresponding growth rates:
+
+```{r}
+short_gr <- get_growth_rates(all_short_forecasts, quantiles = 0.5, method = "smooth_spline")
+short_gr %>% arrange(growth) %>% ggplot(aes(x = growth)) + geom_histogram(bins = 300)
+```
+So on a day-over-day basis the growth rate is mostly increasing, with some strong positive outliers and some amount of decrease.
+
+# Is it geo pooling?
+Let's see what happens if we restrict ourselves to training each geo separately.
+```{r}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+all_geos <- hhs_forecast %>% distinct(geo_value) %>% pull(geo_value)
+hhs_forecast %>% filter(!is.na(hhs)) %>% group_by(geo_value) %>% summarize(n_points = n()) %>% arrange(n_points)
+all_geos_forecasts <- map(all_geos, \(geo) forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead), epi_data = hhs_forecast %>% filter(geo_value == geo)))
+all_geos_forecasts %>% list_rbind() %>% plot_forecasts(default_geos)
+```
+
+And the phenomina is still happening

From 3e57aec029c115ffc664340a117c8aa510492425 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Thu, 10 Apr 2025 18:01:55 -0500
Subject: [PATCH 02/62] geo_pooled

---
 scripts/reports/decreasing_forecasters.Rmd | 38 +++++++++++++++++++---
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index c87e9089..1a16a41a 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -141,7 +141,7 @@ More quantitatively, across all geos:
 basic_gr <- get_growth_rates(all_forecasts, quantiles = 0.5, method = "smooth_spline")
 basic_gr %>% arrange(desc(growth))
 ```
-The only places where the growth rate is positive are american samoa and the US overall, both of which have unusual data trends (as because it is ~0, and the US because it is unusually large).
+The only places where the growth rate is positive are American samoa and the US overall, both of which have unusual data trends (as because it is ~0, and the US because it is unusually large).
 As a histogram (each state is included 5 times, once per ahead):
 ```{r}
 basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
@@ -161,7 +161,7 @@ And the corresponding growth rates:
 short_gr <- get_growth_rates(all_short_forecasts, quantiles = 0.5, method = "smooth_spline")
 short_gr %>% arrange(growth) %>% ggplot(aes(x = growth)) + geom_histogram(bins = 300)
 ```
-So on a day-over-day basis the growth rate is mostly increasing, with some strong positive outliers and some amount of decrease.
+So on a day-over-day basis the growth rate is mostly increasing, with some strong positive outliers and some amount decreasing.
 
 # Is it geo pooling?
 Let's see what happens if we restrict ourselves to training each geo separately.
@@ -169,8 +169,36 @@ Let's see what happens if we restrict ourselves to training each geo separately.
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
 all_geos <- hhs_forecast %>% distinct(geo_value) %>% pull(geo_value)
 hhs_forecast %>% filter(!is.na(hhs)) %>% group_by(geo_value) %>% summarize(n_points = n()) %>% arrange(n_points)
-all_geos_forecasts <- map(all_geos, \(geo) forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead), epi_data = hhs_forecast %>% filter(geo_value == geo)))
-all_geos_forecasts %>% list_rbind() %>% plot_forecasts(default_geos)
+all_geos_forecasts <- map(all_geos, \(geo) forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead), epi_data = hhs_forecast %>% filter(geo_value == geo))) %>% list_rbind()
+all_geos_forecasts %>% plot_forecasts(default_geos)
 ```
 
-And the phenomina is still happening
+And the phenomena is still happening, at least for the default geos.
+Are most negative?
+
+```{r}
+geos_gr <- get_growth_rates(all_geos_forecasts, quantiles = 0.5, method = "smooth_spline")
+geos_gr %>% arrange(desc(growth))
+```
+This is at least more of a mixed bag, with plenty of states with positive growth.
+
+```{r}
+geos_gr %>% ggplot(aes(x = growth)) + geom_histogram(bins = 300)
+```
+But most have a negative growth.
+
+## How different is not geo pooling anyways?
+Well it is at least different; how exactly is hard to parse:
+```{r}
+all_geos_forecasts %>%
+  left_join(all_forecasts, by = join_by(geo_value, forecast_date, target_end_date, quantile), suffix = c("_geo", "_joint")) %>%
+  mutate(value =  value_geo - value_joint) %>%
+  select(-value_geo, -value_joint) %>%
+  filter(geo_value %in% default_geos) %>%
+  ggplot(aes(x = target_end_date, group = geo_value)) +
+  geom_point(aes(y = value, color = quantile)) +
+  facet_wrap(~geo_value, scale = "free")
+```
+
+
+# Direct vs iterative forecasting

From f3bb95d58361e9408cf8ebb27a23673b99ef875a Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Fri, 11 Apr 2025 11:04:44 -0500
Subject: [PATCH 03/62] the problem is quantile regression

---
 scripts/reports/decreasing_forecasters.Rmd | 43 ++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index 1a16a41a..db3b8c17 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -35,6 +35,7 @@ This effect is most prominent in flu, but occurs somewhat in covid.
 We need to resolve the source of this.
 It is some combination of the data and the models used.
 
+# Data setup
 This notebook depends on having successfully run the `flu_hosp_explore` targets pipeline to handle the creation of the basic dataset.
 Accordingly, you need `.Renviron` to include `TAR_PROJECT=flu_hosp_prod`.
 ```{r}
@@ -202,3 +203,45 @@ all_geos_forecasts %>%
 
 
 # Direct vs iterative forecasting
+> Iterative will be monotonic in the horizon. Direct isn’t guaranteed.
+
+Somewhat hard to express, there's also something about the increasing staleness of data that's relevant.
+
+
+# Presence/absence of an intercept
+> Wonder if this could be due to using an intercept combined with broad geo and time pooling.
+
+```{r}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+all_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg()))
+default_geos <- c("ca", "fl", "ny", "pa", "tx")
+plot_forecasts(all_forecasts, default_geos)
+```
+
+# Does filtering by growth rate change it?
+
+# Does switching to a nonlinear engine change it?
+First, confirming that it happens for simple linear regression and not just quantile regression.
+
+```{r}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+all_linear_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg()))
+default_geos <- c("ca", "fl", "ny", "pa", "tx")
+plot_forecasts(all_linear_forecasts, default_geos)
+```
+
+And just like that the problem is gone?
+
+```{r}
+basic_gr <- get_growth_rates(all_linear_forecasts, quantiles = 0.5, method = "smooth_spline")
+basic_gr %>% arrange(growth) %>% print(n=30)
+```
+
+Not particularly clear, so there's a mix of growth and decay.
+
+```{r}
+basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
+```
+The majority of growth rates are positive; there's some amount of negative predictions, but that is to be expected.
+
+There appears to be something about fitting quantiles specifically that is causing this problem.

From 9e126b90a26dfe2c3037d612f2b2981eadc19e9b Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Fri, 11 Apr 2025 12:11:08 -0500
Subject: [PATCH 04/62] double population scaling, fixing made it worse

As in made the rate of decrease in the forecasts worse
---
 scripts/reports/decreasing_forecasters.Rmd | 80 ++++++++++++++++++----
 1 file changed, 68 insertions(+), 12 deletions(-)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index db3b8c17..a4cca620 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -44,10 +44,15 @@ joined_archive <- tar_read(joined_archive_data)
 hhs_archive <- tar_read(hhs_archive) %>% as_epi_archive()
 ```
 
+The data in question
+
+```{r}
+hhs_archive %>% epix_as_of_current() %>% filter(time_value > "2023-10-01") %>% autoplot(hhs)
+```
+
 To avoid running too frequently, we'll limit to a single forecast date just after the peak of the rate of growth, so that ~ everywhere is increasing.
 
 ```{r}
-hhs_archive %>% epix_as_of_current() %>% filter(time_value > "2023-10-01") %>% autoplot(hhs) 
 hhs_gr <- hhs_archive %>%
   epix_as_of_current() %>%
   group_by(geo_value) %>%
@@ -85,7 +90,7 @@ forecast_aheads <- function(forecaster, epi_data = hhs_forecast, aheads = 0:4 *
 
 Here's a way to easily plot a subset of the forecasts, with bands at the 80% and 50% intervals (.1-.9 and .25-.75) against the finalized data.
 ```{r}
-plot_forecasts <- function(all_forecasts,
+plot_dec_forecasts <- function(all_forecasts,
                            geo_values,
                            data_archive = hhs_archive,
                            earliest_truth_data = NULL) {
@@ -131,9 +136,9 @@ get_growth_rates <- function(forecasts, quantiles = NULL, outlier_bound = 1e2, .
 
 ```{r}
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
-all_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead))
+all_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE))
 default_geos <- c("ca", "fl", "ny", "pa", "tx")
-plot_forecasts(all_forecasts, default_geos)
+plot_dec_forecasts(all_forecasts, default_geos)
 ```
 
 All the forecasts are going down rather than up, even though they have multiple weeks of data!
@@ -151,8 +156,8 @@ basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
 If we limit to the last 3 weeks of data (so effectively just a linear extrapolation shared across geos), it goes away:
 ```{r}
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
-all_short_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, n_training=3))
-plot_forecasts(all_short_forecasts, default_geos)
+all_short_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, n_training=3, pop_scaling = FALSE))
+plot_dec_forecasts(all_short_forecasts, default_geos)
 ```
 
 They're pretty jittery, but strictly decreasing they are not.
@@ -170,8 +175,8 @@ Let's see what happens if we restrict ourselves to training each geo separately.
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
 all_geos <- hhs_forecast %>% distinct(geo_value) %>% pull(geo_value)
 hhs_forecast %>% filter(!is.na(hhs)) %>% group_by(geo_value) %>% summarize(n_points = n()) %>% arrange(n_points)
-all_geos_forecasts <- map(all_geos, \(geo) forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead), epi_data = hhs_forecast %>% filter(geo_value == geo))) %>% list_rbind()
-all_geos_forecasts %>% plot_forecasts(default_geos)
+all_geos_forecasts <- map(all_geos, \(geo) forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE), epi_data = hhs_forecast %>% filter(geo_value == geo))) %>% list_rbind()
+all_geos_forecasts %>% plot_dec_forecasts(default_geos)
 ```
 
 And the phenomena is still happening, at least for the default geos.
@@ -213,9 +218,9 @@ Somewhat hard to express, there's also something about the increasing staleness
 
 ```{r}
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
-all_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg()))
+all_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg(), pop_scaling = FALSE))
 default_geos <- c("ca", "fl", "ny", "pa", "tx")
-plot_forecasts(all_forecasts, default_geos)
+plot_dec_forecasts(all_forecasts, default_geos)
 ```
 
 # Does filtering by growth rate change it?
@@ -225,9 +230,9 @@ First, confirming that it happens for simple linear regression and not just quan
 
 ```{r}
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
-all_linear_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg()))
+all_linear_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg(), pop_scaling = FALSE))
 default_geos <- c("ca", "fl", "ny", "pa", "tx")
-plot_forecasts(all_linear_forecasts, default_geos)
+plot_dec_forecasts(all_linear_forecasts, default_geos)
 ```
 
 And just like that the problem is gone?
@@ -245,3 +250,54 @@ basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
 The majority of growth rates are positive; there's some amount of negative predictions, but that is to be expected.
 
 There appears to be something about fitting quantiles specifically that is causing this problem.
+## Fitting one quantile
+What if we just fit the median? This shouldn't change anything
+
+```{r}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+all_one_q_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, quantile_levels = c(0.5), pop_scaling = FALSE))
+default_geos <- c("ca", "fl", "ny", "pa", "tx")
+all_one_q_forecasts %>% ggplot(aes(x = target_end_date, y = value)) + geom_line() + facet_wrap(~geo_value, scale = "free")
+```
+
+And indeed it does not, most forecasts are still negative.
+
+## Stripping away to bare rq
+
+In the interest of simplifying as much as possible, lets just fit using rq directly.
+The problem is most acute in the furthest ahead, so lets use `ahead = 28`.
+
+```{r}
+rec <- epi_recipe(hhs_forecast) %>%
+  step_population_scaling(
+    hhs,
+    df = epidatasets::state_census,
+    df_pop_col = "pop",
+    create_new = FALSE,
+    rate_rescaling = 1e5,
+    by = c("geo_value" = "abbr")
+  ) %>%
+  step_adjust_latency(hhs, method = "extend_lags") %>%
+  step_epi_lag(hhs, lag = c(0,7,14)) %>%
+  step_epi_ahead(hhs, ahead = 28)
+```
+
+With the recipe in hand, we prepare the data, dealing with dropping `NA` values by hand since we're mostly bypassing recipes.
+
+```{r}
+fit_obj <- rec %>% prep(hhs_forecast) %>% bake(hhs_forecast) %>%
+  drop_na(starts_with("lag"), starts_with("ahead")) %>%
+  rq(ahead_28_hhs ~ lag_7_hhs + lag_14_hhs + lag_21_hhs, tau = 0.5, .)
+pred_data <- rec %>% prep(hhs_forecast) %>% bake(hhs_forecast) %>%
+  drop_na(starts_with("lag")) %>%
+  filter(time_value == max(time_value))
+predictions <- predict(fit_obj, pred_data)
+```
+
+And since we're not using layers, we'll have to undo population scaling
+```{r}
+tibble(geo_value = pred_data$geo_value, value = predictions) %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
+  mutate(value = value * pop/1e5) %>%
+  select(-fips, -name, -pop)
+```

From 8afc79fb926b88861ee1c7ac7470dac0f9a2be7a Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Fri, 11 Apr 2025 12:48:23 -0500
Subject: [PATCH 05/62] linear parenthetical, boost trees are fine

---
 scripts/reports/decreasing_forecasters.Rmd | 61 +++++++++++++++++++---
 1 file changed, 55 insertions(+), 6 deletions(-)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index a4cca620..7465f033 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -78,6 +78,7 @@ hhs_gr %>% autoplot(gr_hhs) +
 
 And most locations are still increasing 2 weeks later on the 29th, so we'll use that
 
+Note that this is RATE DATA, and not count data, so `pop_scaling = FALSE` should be the default
 # Some utility functions
 
 Since we don't really need to run the full pipeline to get forecasts from a single day and forecaster, we build a couple of functions for inspecting forecasts.
@@ -207,6 +208,52 @@ all_geos_forecasts %>%
 ```
 
 
+# Does switching to a nonlinear engine change it?
+First, confirming that it happens for simple linear regression and not just quantile regression.
+
+```{r}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+all_linear_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg(), pop_scaling = FALSE))
+default_geos <- c("ca", "fl", "ny", "pa", "tx")
+plot_dec_forecasts(all_linear_forecasts, default_geos)
+```
+
+Which appears to also have this problem (and very narrow quantiles in some locations).
+
+```{r}
+basic_gr <- get_growth_rates(all_linear_forecasts, quantiles = 0.5, method = "smooth_spline")
+basic_gr %>% arrange(desc(growth)) %>% print(n=30)
+```
+
+There's a least a good number of locations/aheads that are positive
+
+```{r}
+basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
+```
+But the majority is negative.
+
+## Boosted trees
+
+```{r}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+
+all_boost_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = boost_tree(mode = "regression"), pop_scaling = FALSE))
+default_geos <- c("ca", "fl", "ny", "pa", "tx")
+plot_dec_forecasts(all_boost_forecasts, default_geos)
+```
+
+Boosted trees don't have the problem? Mostly?
+The forecasts aren't great, but at least they're not plummeting.
+The quantiles are garbage, but that's kind of to be expected.
+
+```{r}
+basic_gr <- get_growth_rates(all_boost_forecasts, quantiles = 0.5, method = "smooth_spline")
+basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
+```
+
+Mostly positive, so I think we can count this as not having the problem. Something about a linear model is the issue.
+
+
 # Direct vs iterative forecasting
 > Iterative will be monotonic in the horizon. Direct isn’t guaranteed.
 
@@ -216,6 +263,7 @@ Somewhat hard to express, there's also something about the increasing staleness
 # Presence/absence of an intercept
 > Wonder if this could be due to using an intercept combined with broad geo and time pooling.
 
+first lets examine
 ```{r}
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
 all_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg(), pop_scaling = FALSE))
@@ -225,24 +273,24 @@ plot_dec_forecasts(all_forecasts, default_geos)
 
 # Does filtering by growth rate change it?
 
-# Does switching to a nonlinear engine change it?
-First, confirming that it happens for simple linear regression and not just quantile regression.
+# Dividing by population squared
+This was an accidental find, but if we *do* population scale, the problem goes away.
 
 ```{r}
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
-all_linear_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg(), pop_scaling = FALSE))
+all_linear_scaled_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg()))
 default_geos <- c("ca", "fl", "ny", "pa", "tx")
-plot_dec_forecasts(all_linear_forecasts, default_geos)
+plot_dec_forecasts(all_linear_scaled_forecasts, default_geos)
 ```
 
 And just like that the problem is gone?
 
 ```{r}
-basic_gr <- get_growth_rates(all_linear_forecasts, quantiles = 0.5, method = "smooth_spline")
+basic_gr <- get_growth_rates(all_linear_scaled_forecasts, quantiles = 0.5, method = "smooth_spline")
 basic_gr %>% arrange(growth) %>% print(n=30)
 ```
 
-Not particularly clear, so there's a mix of growth and decay.
+There's a mix of growth and decay.
 
 ```{r}
 basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
@@ -250,6 +298,7 @@ basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
 The majority of growth rates are positive; there's some amount of negative predictions, but that is to be expected.
 
 There appears to be something about fitting quantiles specifically that is causing this problem.
+
 ## Fitting one quantile
 What if we just fit the median? This shouldn't change anything
 

From 2e527582b9d8d9181e25b4487f6f60a84cd4dc84 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Fri, 11 Apr 2025 13:55:42 -0500
Subject: [PATCH 06/62] moving things around, coefficient inspection

---
 scripts/reports/decreasing_forecasters.Rmd | 68 ++++++++++++++++++----
 1 file changed, 58 insertions(+), 10 deletions(-)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index 7465f033..e4a492ea 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -254,23 +254,71 @@ basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
 Mostly positive, so I think we can count this as not having the problem. Something about a linear model is the issue.
 
 
-# Direct vs iterative forecasting
-> Iterative will be monotonic in the horizon. Direct isn’t guaranteed.
+# Inspecting the linear coefficients
 
-Somewhat hard to express, there's also something about the increasing staleness of data that's relevant.
+First lets examine the coefficients that are actually fit; to do that from within scaled_pop would involve a `browser()`. For the sake of reproducibility, we will make the steps by hand
 
+```{r}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+linear_get_workflow <- function(ahead) {
+  preproc <- epi_recipe(hhs_forecast) %>%
+    step_population_scaling(
+      hhs,
+      df = epidatasets::state_census,
+      df_pop_col = "pop",
+      create_new = FALSE,
+      rate_rescaling = 1e5,
+      by = c("geo_value" = "abbr")
+    ) %>%
+    step_adjust_latency(hhs, method = "extend_lags") %>%
+    step_epi_lag(hhs, lag = c(0,7,14)) %>%
+    step_epi_ahead(hhs, ahead = ahead) %>%
+    step_epi_naomit()
+  postproc <- frosting() %>%
+    layer_predict() %>%
+    layer_residual_quantiles(quantile_levels = covidhub_probs()) %>%
+    layer_threshold() %>%
+    layer_naomit() %>%
+    layer_add_target_date() %>%
+    layer_add_forecast_date()
+  workflow <- epi_workflow(preproc, linear_reg()) %>%
+    fit(hhs_forecast) %>%
+    add_frosting(postproc)
+  workflow
+}
+all_workflows <- map(0:4 * 7, linear_get_workflow)
+```
 
-# Presence/absence of an intercept
-> Wonder if this could be due to using an intercept combined with broad geo and time pooling.
+Starting with the largest ahead, the coefficients are
+```{r}
+workflows::extract_fit_parsnip(all_workflows[[5]])
+```
+
+So the intercept is actually positive (so it's not biased towards decreasing inherently), but the coefficients for the two lags are negative.
+Even including that, the `lag_7_hhs` coefficient is less than one, so regardless of the fact that two coefficients are negative it will de-facto always be below the original value.
+
+How about the zero ahead?
+Thanks to latency this is actually a one week forward projection, so we can't expect exactly just `lag_7_hhs` to be one and everything else zero.
 
-first lets examine
 ```{r}
-hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
-all_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg(), pop_scaling = FALSE))
-default_geos <- c("ca", "fl", "ny", "pa", "tx")
-plot_dec_forecasts(all_forecasts, default_geos)
+workflows::extract_fit_parsnip(all_workflows[[1]])
 ```
 
+And that is surprisingly close to exactly the `lag_7_hhs` value. 
+If the signal were constant so far, it would still be predicting a decrease thanks to the `lag_21_hhs` coefficient.
+
+## Presence/absence of an intercept
+> Wonder if this could be due to using an intercept combined with broad geo and time pooling.
+
+This is a bit more annoying to try to implement, since it seems that parsnip doesn't support disabling the intercept outside the formula (so we'd have to do it in the recipe somehow).
+
+# Direct vs iterative forecasting
+> Iterative will be monotonic in the horizon. Direct isn’t guaranteed.
+
+Somewhat hard to express, there's also something about the increasing staleness of data that's relevant.
+Actually implementing an experiment just means including the median from the previous forecast as a data point and forecasting one day further into the future.
+
+
 # Does filtering by growth rate change it?
 
 # Dividing by population squared

From 9cc9f39322b955a0f0c6e3418951f142fce8965d Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Fri, 11 Apr 2025 14:01:15 -0700
Subject: [PATCH 07/62] format and add as_of plots to fanplots

---
 scripts/reports/decreasing_forecasters.Rmd | 88 +++++++++++++++-------
 1 file changed, 61 insertions(+), 27 deletions(-)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index e4a492ea..de5bcc64 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -37,7 +37,7 @@ It is some combination of the data and the models used.
 
 # Data setup
 This notebook depends on having successfully run the `flu_hosp_explore` targets pipeline to handle the creation of the basic dataset.
-Accordingly, you need `.Renviron` to include `TAR_PROJECT=flu_hosp_prod`.
+Accordingly, you need `.Renviron` to include `TAR_PROJECT=flu_hosp_explore`.
 ```{r}
 tar_make(joined_archive_data)
 joined_archive <- tar_read(joined_archive_data)
@@ -47,7 +47,10 @@ hhs_archive <- tar_read(hhs_archive) %>% as_epi_archive()
 The data in question
 
 ```{r}
-hhs_archive %>% epix_as_of_current() %>% filter(time_value > "2023-10-01") %>% autoplot(hhs)
+hhs_archive %>%
+  epix_as_of_current() %>%
+  filter(time_value > "2023-10-01") %>%
+  autoplot(hhs)
 ```
 
 To avoid running too frequently, we'll limit to a single forecast date just after the peak of the rate of growth, so that ~ everywhere is increasing.
@@ -92,12 +95,18 @@ forecast_aheads <- function(forecaster, epi_data = hhs_forecast, aheads = 0:4 *
 Here's a way to easily plot a subset of the forecasts, with bands at the 80% and 50% intervals (.1-.9 and .25-.75) against the finalized data.
 ```{r}
 plot_dec_forecasts <- function(all_forecasts,
-                           geo_values,
-                           data_archive = hhs_archive,
-                           earliest_truth_data = NULL) {
+                               geo_values,
+                               data_archive = hhs_archive,
+                               earliest_truth_data = NULL) {
   if (is.null(earliest_truth_data)) {
     earliest_truth_data <- all_forecasts$forecast_date[[1]] - as.difftime(365, units = "days")
   }
+  as_of_plotting <- data_archive %>%
+    epix_as_of(min(all_forecasts$forecast_date)) %>%
+    filter(time_value <= max(all_forecasts$target_end_date), geo_value %in% geo_values) %>%
+    as_tibble() %>%
+    mutate(forecast_date = time_value) %>%
+    filter(time_value >= earliest_truth_data)
   # transform the archive to something useful for comparison
   finalized_plotting <- data_archive %>%
     epix_as_of_current() %>%
@@ -105,14 +114,15 @@ plot_dec_forecasts <- function(all_forecasts,
     as_tibble() %>%
     mutate(forecast_date = time_value) %>%
     filter(time_value >= earliest_truth_data)
-  all_forecasts %>% filter(geo_value %in% geo_values) %>%
+  all_forecasts %>%
+    filter(geo_value %in% geo_values) %>%
     pivot_wider(names_from = quantile, values_from = value) %>%
     ggplot(aes(x = target_end_date, group = geo_value, fill = forecast_date)) +
     geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), alpha = 0.4) +
     geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), alpha = 0.6) +
     geom_line(aes(y = `0.5`, color = forecast_date)) +
-    geom_line(
-      data = finalized_plotting, aes(x = time_value, y = hhs)) +
+    geom_line(data = as_of_plotting, aes(x = time_value, y = hhs), color = "grey") +
+    geom_line(data = finalized_plotting, aes(x = time_value, y = hhs), color = "black") +
     facet_wrap(~geo_value, scale = "free") +
     theme(legend.position = "none")
 }
@@ -151,13 +161,14 @@ basic_gr %>% arrange(desc(growth))
 The only places where the growth rate is positive are American samoa and the US overall, both of which have unusual data trends (as because it is ~0, and the US because it is unusually large).
 As a histogram (each state is included 5 times, once per ahead):
 ```{r}
-basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
+basic_gr %>% ggplot(aes(x = growth)) +
+  geom_histogram(bins = 300)
 ```
 ## It goes away if we use very short windows
 If we limit to the last 3 weeks of data (so effectively just a linear extrapolation shared across geos), it goes away:
 ```{r}
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
-all_short_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, n_training=3, pop_scaling = FALSE))
+all_short_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, n_training = 3, pop_scaling = FALSE))
 plot_dec_forecasts(all_short_forecasts, default_geos)
 ```
 
@@ -166,7 +177,10 @@ And the corresponding growth rates:
 
 ```{r}
 short_gr <- get_growth_rates(all_short_forecasts, quantiles = 0.5, method = "smooth_spline")
-short_gr %>% arrange(growth) %>% ggplot(aes(x = growth)) + geom_histogram(bins = 300)
+short_gr %>%
+  arrange(growth) %>%
+  ggplot(aes(x = growth)) +
+  geom_histogram(bins = 300)
 ```
 So on a day-over-day basis the growth rate is mostly increasing, with some strong positive outliers and some amount decreasing.
 
@@ -174,8 +188,14 @@ So on a day-over-day basis the growth rate is mostly increasing, with some stron
 Let's see what happens if we restrict ourselves to training each geo separately.
 ```{r}
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
-all_geos <- hhs_forecast %>% distinct(geo_value) %>% pull(geo_value)
-hhs_forecast %>% filter(!is.na(hhs)) %>% group_by(geo_value) %>% summarize(n_points = n()) %>% arrange(n_points)
+all_geos <- hhs_forecast %>%
+  distinct(geo_value) %>%
+  pull(geo_value)
+hhs_forecast %>%
+  filter(!is.na(hhs)) %>%
+  group_by(geo_value) %>%
+  summarize(n_points = n()) %>%
+  arrange(n_points)
 all_geos_forecasts <- map(all_geos, \(geo) forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE), epi_data = hhs_forecast %>% filter(geo_value == geo))) %>% list_rbind()
 all_geos_forecasts %>% plot_dec_forecasts(default_geos)
 ```
@@ -190,7 +210,8 @@ geos_gr %>% arrange(desc(growth))
 This is at least more of a mixed bag, with plenty of states with positive growth.
 
 ```{r}
-geos_gr %>% ggplot(aes(x = growth)) + geom_histogram(bins = 300)
+geos_gr %>% ggplot(aes(x = growth)) +
+  geom_histogram(bins = 300)
 ```
 But most have a negative growth.
 
@@ -199,7 +220,7 @@ Well it is at least different; how exactly is hard to parse:
 ```{r}
 all_geos_forecasts %>%
   left_join(all_forecasts, by = join_by(geo_value, forecast_date, target_end_date, quantile), suffix = c("_geo", "_joint")) %>%
-  mutate(value =  value_geo - value_joint) %>%
+  mutate(value = value_geo - value_joint) %>%
   select(-value_geo, -value_joint) %>%
   filter(geo_value %in% default_geos) %>%
   ggplot(aes(x = target_end_date, group = geo_value)) +
@@ -222,13 +243,16 @@ Which appears to also have this problem (and very narrow quantiles in some locat
 
 ```{r}
 basic_gr <- get_growth_rates(all_linear_forecasts, quantiles = 0.5, method = "smooth_spline")
-basic_gr %>% arrange(desc(growth)) %>% print(n=30)
+basic_gr %>%
+  arrange(desc(growth)) %>%
+  print(n = 30)
 ```
 
 There's a least a good number of locations/aheads that are positive
 
 ```{r}
-basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
+basic_gr %>% ggplot(aes(x = growth)) +
+  geom_histogram(bins = 300)
 ```
 But the majority is negative.
 
@@ -248,7 +272,8 @@ The quantiles are garbage, but that's kind of to be expected.
 
 ```{r}
 basic_gr <- get_growth_rates(all_boost_forecasts, quantiles = 0.5, method = "smooth_spline")
-basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
+basic_gr %>% ggplot(aes(x = growth)) +
+  geom_histogram(bins = 300)
 ```
 
 Mostly positive, so I think we can count this as not having the problem. Something about a linear model is the issue.
@@ -271,7 +296,7 @@ linear_get_workflow <- function(ahead) {
       by = c("geo_value" = "abbr")
     ) %>%
     step_adjust_latency(hhs, method = "extend_lags") %>%
-    step_epi_lag(hhs, lag = c(0,7,14)) %>%
+    step_epi_lag(hhs, lag = c(0, 7, 14)) %>%
     step_epi_ahead(hhs, ahead = ahead) %>%
     step_epi_naomit()
   postproc <- frosting() %>%
@@ -304,7 +329,7 @@ Thanks to latency this is actually a one week forward projection, so we can't ex
 workflows::extract_fit_parsnip(all_workflows[[1]])
 ```
 
-And that is surprisingly close to exactly the `lag_7_hhs` value. 
+And that is surprisingly close to exactly the `lag_7_hhs` value.
 If the signal were constant so far, it would still be predicting a decrease thanks to the `lag_21_hhs` coefficient.
 
 ## Presence/absence of an intercept
@@ -335,13 +360,16 @@ And just like that the problem is gone?
 
 ```{r}
 basic_gr <- get_growth_rates(all_linear_scaled_forecasts, quantiles = 0.5, method = "smooth_spline")
-basic_gr %>% arrange(growth) %>% print(n=30)
+basic_gr %>%
+  arrange(growth) %>%
+  print(n = 30)
 ```
 
 There's a mix of growth and decay.
 
 ```{r}
-basic_gr %>%  ggplot(aes(x = growth)) + geom_histogram(bins = 300)
+basic_gr %>% ggplot(aes(x = growth)) +
+  geom_histogram(bins = 300)
 ```
 The majority of growth rates are positive; there's some amount of negative predictions, but that is to be expected.
 
@@ -354,7 +382,9 @@ What if we just fit the median? This shouldn't change anything
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
 all_one_q_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, quantile_levels = c(0.5), pop_scaling = FALSE))
 default_geos <- c("ca", "fl", "ny", "pa", "tx")
-all_one_q_forecasts %>% ggplot(aes(x = target_end_date, y = value)) + geom_line() + facet_wrap(~geo_value, scale = "free")
+all_one_q_forecasts %>% ggplot(aes(x = target_end_date, y = value)) +
+  geom_line() +
+  facet_wrap(~geo_value, scale = "free")
 ```
 
 And indeed it does not, most forecasts are still negative.
@@ -375,17 +405,21 @@ rec <- epi_recipe(hhs_forecast) %>%
     by = c("geo_value" = "abbr")
   ) %>%
   step_adjust_latency(hhs, method = "extend_lags") %>%
-  step_epi_lag(hhs, lag = c(0,7,14)) %>%
+  step_epi_lag(hhs, lag = c(0, 7, 14)) %>%
   step_epi_ahead(hhs, ahead = 28)
 ```
 
 With the recipe in hand, we prepare the data, dealing with dropping `NA` values by hand since we're mostly bypassing recipes.
 
 ```{r}
-fit_obj <- rec %>% prep(hhs_forecast) %>% bake(hhs_forecast) %>%
+fit_obj <- rec %>%
+  prep(hhs_forecast) %>%
+  bake(hhs_forecast) %>%
   drop_na(starts_with("lag"), starts_with("ahead")) %>%
   rq(ahead_28_hhs ~ lag_7_hhs + lag_14_hhs + lag_21_hhs, tau = 0.5, .)
-pred_data <- rec %>% prep(hhs_forecast) %>% bake(hhs_forecast) %>%
+pred_data <- rec %>%
+  prep(hhs_forecast) %>%
+  bake(hhs_forecast) %>%
   drop_na(starts_with("lag")) %>%
   filter(time_value == max(time_value))
 predictions <- predict(fit_obj, pred_data)
@@ -395,6 +429,6 @@ And since we're not using layers, we'll have to undo population scaling
 ```{r}
 tibble(geo_value = pred_data$geo_value, value = predictions) %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
-  mutate(value = value * pop/1e5) %>%
+  mutate(value = value * pop / 1e5) %>%
   select(-fips, -name, -pop)
 ```

From 78387df03435cd51e1fcaf17d897996fbc45bef7 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Fri, 11 Apr 2025 16:22:10 -0500
Subject: [PATCH 08/62] exploring the fit data in detail

---
 scripts/reports/decreasing_forecasters.Rmd | 133 ++++++++++++++++++++-
 1 file changed, 127 insertions(+), 6 deletions(-)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index de5bcc64..3477e08e 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -279,13 +279,39 @@ basic_gr %>% ggplot(aes(x = growth)) +
 Mostly positive, so I think we can count this as not having the problem. Something about a linear model is the issue.
 
 
+# Does fitting only 1 lag change things?
+
+```{r}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date) %>% filter(time_value > "2022-06-01")
+all_one_lag_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE, lag = 0))
+all_one_lag_forecasts
+plot_dec_forecasts(all_one_lag_forecasts, default_geos)
+```
+
+Still constantly falling, unfortunately, and a surprisingly similar forecast.
+Sanity check that this is actually a different forecast:
+
+```{r}
+all_forecasts %>%
+  left_join(all_one_lag_forecasts, by = c("geo_value", "forecast_date", "target_end_date", "quantile")) %>%
+  mutate(value = value.x - value.y) %>%
+  summarize(net_diff = sum(abs(value)))
+```
+
+Which is indeed larger than zero
+
 # Inspecting the linear coefficients
 
-First lets examine the coefficients that are actually fit; to do that from within scaled_pop would involve a `browser()`. For the sake of reproducibility, we will make the steps by hand
+First lets examine the coefficients that are actually fit; to do that from within scaled_pop would involve a `browser()`. 
+For the sake of reproducibility, we will make the steps by hand.
+Note that I've tried this section with both filtering pre 2022 values and not, and the results are approximately the same.
 
 ```{r}
-hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
-linear_get_workflow <- function(ahead) {
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date) %>% filter(time_value > "2022-06-01")
+```
+
+```{r}
+linear_get_preproc <- function(ahead, lag = c(0,7,14)) {
   preproc <- epi_recipe(hhs_forecast) %>%
     step_population_scaling(
       hhs,
@@ -296,9 +322,12 @@ linear_get_workflow <- function(ahead) {
       by = c("geo_value" = "abbr")
     ) %>%
     step_adjust_latency(hhs, method = "extend_lags") %>%
-    step_epi_lag(hhs, lag = c(0, 7, 14)) %>%
+    step_epi_lag(hhs, lag = lag) %>%
     step_epi_ahead(hhs, ahead = ahead) %>%
     step_epi_naomit()
+}
+linear_get_workflow <- function(ahead, lag = c(0,7,14)) {
+  preproc <- linear_get_preproc(ahead, lag)
   postproc <- frosting() %>%
     layer_predict() %>%
     layer_residual_quantiles(quantile_levels = covidhub_probs()) %>%
@@ -329,14 +358,106 @@ Thanks to latency this is actually a one week forward projection, so we can't ex
 workflows::extract_fit_parsnip(all_workflows[[1]])
 ```
 
-And that is surprisingly close to exactly the `lag_7_hhs` value.
-If the signal were constant so far, it would still be predicting a decrease thanks to the `lag_21_hhs` coefficient.
+And that is surprisingly close to exactly the `lag_7_hhs` value. 
+If the signal were constant so far though, it would still be predicting a decrease thanks to the `lag_21_hhs` coefficient.
 
 ## Presence/absence of an intercept
 > Wonder if this could be due to using an intercept combined with broad geo and time pooling.
 
 This is a bit more annoying to try to implement, since it seems that parsnip doesn't support disabling the intercept outside the formula (so we'd have to do it in the recipe somehow).
 
+## Inspecting the actual data that is fit
+
+Given that something strange is going on with the data that we're fitting, it is worth plotting the data that we're fitting.
+Since visualizing a 4D vector is a pain, let's start with the `lag = 0` case, which still has similar behavior (and in fact is the dominant coefficient above anyways).
+The way to get the data is using prep and bake:
+
+```{r}
+preproc <- linear_get_preproc(0:4 * 7)
+fit_data <- preproc %>%
+  prep(hhs_forecast) %>% bake(hhs_forecast) %>%
+  select(geo_value, time_value, starts_with("lag"), starts_with("ahead"))
+lm_fit <- lm(ahead_28_hhs ~ lag_7_hhs, fit_data)
+fit_data %>% drop_na(lag_7_hhs, ahead_28_hhs) %>% ggplot(aes(x = lag_7_hhs, y = ahead_28_hhs)) + geom_point() + geom_smooth(method = 'lm', formula = y~x) + annotate(label = sprintf("y = %.3f + %.3f x\nR² = %.2f", coef(lm_fit)[[1]], coef(lm_fit)[[2]], summary(lm_fit)$r.squared), geom = "text", x = 1.75, y = 1.75, size = 7) + geom_abline(intercept = 0, slope = 1)
+```
+
+which is... positive at least.
+More importantly though, the coefficient on x is less than one, so this still decays.
+
+Comparing with the coefficient for this ahead and only this lag:
+
+```{r}
+all_single_workflows <- map(0:4 * 7, \(ahead) linear_get_workflow(ahead, lag = 0))
+workflows::extract_fit_parsnip(all_single_workflows[[5]])
+```
+
+which is the same value, so it's the same fit (as we should expect).
+Making a similar plot across the values of head:
+
+```{r}
+plot_linear_data <- function(fit_data_long) {
+  fit_data_long %>%
+    drop_na(value) %>%
+    ggplot(aes(x = lag_7_hhs, y = value, color = epi_week)) +
+    geom_point() +
+    ggplot2::scale_color_viridis_c() +
+    geom_smooth(method = 'lm', formula = y~x) +
+    geom_abline(intercept = 0, slope = 1) +
+    facet_wrap(~ahead_value, scales = "free")
+}
+fit_data %>%
+  mutate(epi_week = epiweek(time_value)) %>%
+  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
+  plot_linear_data()
+```
+
+All of which have slope less than 1.
+There is a stronger correlation between the smaller aheads and the value, and the slope is much closer to one.
+Adding the color corresponding to the season week potentially gives us some idea of the problem; there is a large mass of values with a low slope and a low `epi_week`.
+If we crudely just cut out everything with an `epi_week` below 10:
+
+```{r}
+fit_data %>%
+  mutate(epi_week = epiweek(time_value)) %>%
+  filter(epi_week > 10) %>%
+  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
+  plot_linear_data()
+```
+
+which is better but still below one.
+For some reason most of the remaining points egregiously below the diagonal are in `epi_weeks` very near 50. If we filter out the last 4 weeks:
+
+```{r}
+fit_data %>%
+  mutate(epi_week = epiweek(time_value)) %>%
+  filter(epi_week > 10, epi_week < 48) %>%
+  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
+  plot_linear_data()
+```
+
+Then suddenly the slopes can be quite large!
+This is of course not a particularly principled way of selecting training data.
+
+## Aside: what if we fit on a log scale?
+
+I originally added this plot in an attempt to make the blob near zero clearer, but it actually just did a linear fit on the log graph.
+This is significantly closer to a positive linear slope, though it is still negative.
+
+```{r}
+fit_data %>%
+  mutate(epi_week = epiweek(time_value)) %>%
+  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
+  drop_na(value) %>%
+  ggplot(aes(x = lag_7_hhs, y = value, color = epi_week)) +
+  geom_point() +
+  ggplot2::scale_color_viridis_c() +
+  geom_smooth(method = 'lm', formula = y~x) +
+  geom_abline(intercept = 0, slope = 1) +
+  scale_y_log10() +
+  scale_x_log10() +
+  facet_wrap(~ahead_value, scales = "free")
+```
+
 # Direct vs iterative forecasting
 > Iterative will be monotonic in the horizon. Direct isn’t guaranteed.
 

From 05e4dc54b8b52cb787b645b549e09ba6435cecc5 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Fri, 11 Apr 2025 18:11:19 -0500
Subject: [PATCH 09/62] also doing this for covid

---
 scripts/reports/decreasing_forecasters.Rmd | 76 ++++++++++++++++++----
 1 file changed, 63 insertions(+), 13 deletions(-)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index 3477e08e..496e30ed 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -311,23 +311,15 @@ hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date) %>% filter(time_value
 ```
 
 ```{r}
-linear_get_preproc <- function(ahead, lag = c(0,7,14)) {
-  preproc <- epi_recipe(hhs_forecast) %>%
-    step_population_scaling(
-      hhs,
-      df = epidatasets::state_census,
-      df_pop_col = "pop",
-      create_new = FALSE,
-      rate_rescaling = 1e5,
-      by = c("geo_value" = "abbr")
-    ) %>%
+linear_get_preproc <- function(ahead, lag = c(0,7,14), in_data = hhs_forecast) {
+  preproc <- epi_recipe(in_data) %>%
     step_adjust_latency(hhs, method = "extend_lags") %>%
     step_epi_lag(hhs, lag = lag) %>%
     step_epi_ahead(hhs, ahead = ahead) %>%
     step_epi_naomit()
 }
-linear_get_workflow <- function(ahead, lag = c(0,7,14)) {
-  preproc <- linear_get_preproc(ahead, lag)
+linear_get_workflow <- function(ahead, lag = c(0,7,14), in_data = hhs_forecast) {
+  preproc <- linear_get_preproc(ahead, lag, in_data)
   postproc <- frosting() %>%
     layer_predict() %>%
     layer_residual_quantiles(quantile_levels = covidhub_probs()) %>%
@@ -336,7 +328,7 @@ linear_get_workflow <- function(ahead, lag = c(0,7,14)) {
     layer_add_target_date() %>%
     layer_add_forecast_date()
   workflow <- epi_workflow(preproc, linear_reg()) %>%
-    fit(hhs_forecast) %>%
+    fit(in_data) %>%
     add_frosting(postproc)
   workflow
 }
@@ -438,6 +430,62 @@ fit_data %>%
 Then suddenly the slopes can be quite large!
 This is of course not a particularly principled way of selecting training data.
 
+# Reproducing in the context of covid
+Constantly decreasing forecasters has been less of an issue in covid, so we should do a comparison.
+Since we're assuming the project is `flu_hosp_explore`, we have to directly access the covid archive.
+Covid is in counts, so first we convert to rates.
+
+```{r}
+hhs_covid_archive <-
+  qs2::qs_read(here::here("covid_hosp_explore/objects/hhs_archive"))$DT %>%
+    filter(time_value > "2022-06-01") %>% 
+    left_join(state_census, by = join_by(geo_value == abbr)) %>%
+    mutate(value = value / pop * 1e5) %>%
+    select(-fips, -name, -pop) %>%
+    rename(hhs = value) %>%
+    as_epi_archive()
+hhs_covid_forecast <- hhs_covid_archive %>% epix_as_of(forecast_date)
+hhs_covid_forecast %>% autoplot(hhs)
+hhs_covid_forecast %>% pull(hhs) %>% max
+```
+
+Forecasting using the same methods as the original problem
+```{r}
+all_covid_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE), epi_dat = hhs_covid_forecast)
+plot_dec_forecasts(all_covid_forecasts, default_geos, hhs_covid_archive)
+```
+
+Still sort of present!
+Let's take a look at what the forecast is actually seeing
+
+```{r}
+preproc <- linear_get_preproc(0:4 * 7, in_data = hhs_covid_forecast)
+fit_covid_data <- preproc %>%
+  prep(hhs_covid_forecast) %>%
+  bake(hhs_covid_forecast) %>%
+  select(geo_value, time_value, starts_with("lag"), starts_with("ahead"))
+fit_covid_data %>%
+  mutate(epi_week = epiweek(time_value)) %>%
+  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
+  plot_linear_data()
+```
+
+Which, other than some massive outliers, has similar trends (the slope decreases as we increase the ahead, along with the spread).
+
+And if we cut out the early `epi_weeks`?
+
+```{r}
+fit_covid_data %>%
+  mutate(epi_week = epiweek(time_value)) %>%
+  filter(epi_week > 10) %>%
+  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
+  plot_linear_data()
+```
+
+Which is marginally better.
+The problematic data is less clear here date-wise than in the case of flu; partly this is because the data covers a much larger range of values.
+Partly I suspect that this is also it's less seasonal.
+
 ## Aside: what if we fit on a log scale?
 
 I originally added this plot in an attempt to make the blob near zero clearer, but it actually just did a linear fit on the log graph.
@@ -458,6 +506,8 @@ fit_data %>%
   facet_wrap(~ahead_value, scales = "free")
 ```
 
+# What if we restrict to `10<epi_week<48`
+This turned
 # Direct vs iterative forecasting
 > Iterative will be monotonic in the horizon. Direct isn’t guaranteed.
 

From 19960bcfdd6c565036dd8c332027a738174b24b8 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Fri, 11 Apr 2025 18:51:49 -0700
Subject: [PATCH 10/62] reorg

---
 .gitignore                                 |   2 +
 reports/template.md                        |   1 +
 scripts/reports/decreasing_forecasters.Rmd | 272 ++++++++++++---------
 3 files changed, 160 insertions(+), 115 deletions(-)

diff --git a/.gitignore b/.gitignore
index a9a43334..412a0b00 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,5 @@ data/
 .nhsn_covid_cache.parquet
 .nhsn_flu_cache.parquet
 meta/
+**/unnamed-chunk*
+decreasing_forecasters_cache/
\ No newline at end of file
diff --git a/reports/template.md b/reports/template.md
index 10006a52..738be7df 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -10,6 +10,7 @@
 
 ## Exploration Reports
 
+- [An Analysis of Decreasing Behavior in Forecasters](decreasing_forecasters.html)
 - [NHSN 2024-2025 Data Analysis](new_data.html)
 
 ### Flu
diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index 496e30ed..73e006bb 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -1,18 +1,14 @@
 ---
 title: "Decreasing Forecasters"
 author: Delphi Forecast Team
-date: "`Sys.date()`"
+date: "compiled on `r format(Sys.time(), '%d %B %Y')`"
 output:
   html_document:
-    code_folding: show
-    toc: True
-    # self_contained: False
+    code_folding: hide
+    toc: true
     # lib_dir: libs
-params:
-  disease: "covid"
-  forecast_res: !r ""
-  forecast_date: !r ""
-  truth_data: !r ""
+editor_options:
+  chunk_output_type: console
 ---
 
 $$\\[.4in]$$
@@ -35,13 +31,18 @@ This effect is most prominent in flu, but occurs somewhat in covid.
 We need to resolve the source of this.
 It is some combination of the data and the models used.
 
-# Data setup
+# Setup
+
+## Data
+
 This notebook depends on having successfully run the `flu_hosp_explore` targets pipeline to handle the creation of the basic dataset.
-Accordingly, you need `.Renviron` to include `TAR_PROJECT=flu_hosp_explore`.
-```{r}
+
+```{r echo=FALSE}
+Sys.setenv(TAR_PROJECT = "flu_hosp_explore")
 tar_make(joined_archive_data)
 joined_archive <- tar_read(joined_archive_data)
 hhs_archive <- tar_read(hhs_archive) %>% as_epi_archive()
+train_value_min <- "2022-06-01"
 ```
 
 The data in question
@@ -82,9 +83,11 @@ hhs_gr %>% autoplot(gr_hhs) +
 And most locations are still increasing 2 weeks later on the 29th, so we'll use that
 
 Note that this is RATE DATA, and not count data, so `pop_scaling = FALSE` should be the default
-# Some utility functions
+
+## Some utility functions
 
 Since we don't really need to run the full pipeline to get forecasts from a single day and forecaster, we build a couple of functions for inspecting forecasts.
+
 ```{r}
 forecast_aheads <- function(forecaster, epi_data = hhs_forecast, aheads = 0:4 * 7) {
   all_forecasts <- map(aheads, \(ahead) forecaster(epi_data, ahead)) %>% list_rbind()
@@ -93,6 +96,7 @@ forecast_aheads <- function(forecaster, epi_data = hhs_forecast, aheads = 0:4 *
 ```
 
 Here's a way to easily plot a subset of the forecasts, with bands at the 80% and 50% intervals (.1-.9 and .25-.75) against the finalized data.
+
 ```{r}
 plot_dec_forecasts <- function(all_forecasts,
                                geo_values,
@@ -130,6 +134,7 @@ plot_dec_forecasts <- function(all_forecasts,
 
 And a method to inspect whether things are increasing that isn't just the eyeball norm on a few of them.
 This calculates growth rates for each quantile and each location.
+
 ```{r}
 get_growth_rates <- function(forecasts, quantiles = NULL, outlier_bound = 1e2, ...) {
   if (is.null(quantiles)) {
@@ -143,10 +148,12 @@ get_growth_rates <- function(forecasts, quantiles = NULL, outlier_bound = 1e2, .
 }
 ```
 
-# Establishing the problem
+## Establishing the problem
 
 ```{r}
-hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+hhs_forecast <- hhs_archive %>%
+  epix_as_of(forecast_date) %>%
+  filter(time_value > train_value_min)
 all_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE))
 default_geos <- c("ca", "fl", "ny", "pa", "tx")
 plot_dec_forecasts(all_forecasts, default_geos)
@@ -158,16 +165,24 @@ More quantitatively, across all geos:
 basic_gr <- get_growth_rates(all_forecasts, quantiles = 0.5, method = "smooth_spline")
 basic_gr %>% arrange(desc(growth))
 ```
+
 The only places where the growth rate is positive are American samoa and the US overall, both of which have unusual data trends (as because it is ~0, and the US because it is unusually large).
 As a histogram (each state is included 5 times, once per ahead):
+
 ```{r}
 basic_gr %>% ggplot(aes(x = growth)) +
   geom_histogram(bins = 300)
 ```
-## It goes away if we use very short windows
+
+# Model tweaks
+
+## Very short training windows remove the decreasing problem
+
 If we limit to the last 3 weeks of data (so effectively just a linear extrapolation shared across geos), it goes away:
 ```{r}
-hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+hhs_forecast <- hhs_archive %>%
+  epix_as_of(forecast_date) %>%
+  filter(time_value > train_value_min)
 all_short_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, n_training = 3, pop_scaling = FALSE))
 plot_dec_forecasts(all_short_forecasts, default_geos)
 ```
@@ -184,56 +199,14 @@ short_gr %>%
 ```
 So on a day-over-day basis the growth rate is mostly increasing, with some strong positive outliers and some amount decreasing.
 
-# Is it geo pooling?
-Let's see what happens if we restrict ourselves to training each geo separately.
-```{r}
-hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
-all_geos <- hhs_forecast %>%
-  distinct(geo_value) %>%
-  pull(geo_value)
-hhs_forecast %>%
-  filter(!is.na(hhs)) %>%
-  group_by(geo_value) %>%
-  summarize(n_points = n()) %>%
-  arrange(n_points)
-all_geos_forecasts <- map(all_geos, \(geo) forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE), epi_data = hhs_forecast %>% filter(geo_value == geo))) %>% list_rbind()
-all_geos_forecasts %>% plot_dec_forecasts(default_geos)
-```
-
-And the phenomena is still happening, at least for the default geos.
-Are most negative?
-
-```{r}
-geos_gr <- get_growth_rates(all_geos_forecasts, quantiles = 0.5, method = "smooth_spline")
-geos_gr %>% arrange(desc(growth))
-```
-This is at least more of a mixed bag, with plenty of states with positive growth.
-
-```{r}
-geos_gr %>% ggplot(aes(x = growth)) +
-  geom_histogram(bins = 300)
-```
-But most have a negative growth.
-
-## How different is not geo pooling anyways?
-Well it is at least different; how exactly is hard to parse:
-```{r}
-all_geos_forecasts %>%
-  left_join(all_forecasts, by = join_by(geo_value, forecast_date, target_end_date, quantile), suffix = c("_geo", "_joint")) %>%
-  mutate(value = value_geo - value_joint) %>%
-  select(-value_geo, -value_joint) %>%
-  filter(geo_value %in% default_geos) %>%
-  ggplot(aes(x = target_end_date, group = geo_value)) +
-  geom_point(aes(y = value, color = quantile)) +
-  facet_wrap(~geo_value, scale = "free")
-```
-
+## Switching to linear regression slightly mitigates, but doesn't remove the problem
 
-# Does switching to a nonlinear engine change it?
 First, confirming that it happens for simple linear regression and not just quantile regression.
 
 ```{r}
-hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+hhs_forecast <- hhs_archive %>%
+  epix_as_of(forecast_date) %>%
+  filter(time_value > train_value_min)
 all_linear_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg(), pop_scaling = FALSE))
 default_geos <- c("ca", "fl", "ny", "pa", "tx")
 plot_dec_forecasts(all_linear_forecasts, default_geos)
@@ -254,12 +227,17 @@ There's a least a good number of locations/aheads that are positive
 basic_gr %>% ggplot(aes(x = growth)) +
   geom_histogram(bins = 300)
 ```
+
 But the majority is negative.
 
-## Boosted trees
+## Switching to a nonlinear engine removes the decreasing problem
+
+### Boosted trees
 
 ```{r}
-hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+hhs_forecast <- hhs_archive %>%
+  epix_as_of(forecast_date) %>%
+  filter(time_value > train_value_min)
 
 all_boost_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = boost_tree(mode = "regression"), pop_scaling = FALSE))
 default_geos <- c("ca", "fl", "ny", "pa", "tx")
@@ -278,47 +256,48 @@ basic_gr %>% ggplot(aes(x = growth)) +
 
 Mostly positive, so I think we can count this as not having the problem. Something about a linear model is the issue.
 
-
-# Does fitting only 1 lag change things?
+## Fitting only 1 lag does not change the problem
 
 ```{r}
-hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date) %>% filter(time_value > "2022-06-01")
+hhs_forecast <- hhs_archive %>%
+  epix_as_of(forecast_date) %>%
+  filter(time_value > train_value_min)
 all_one_lag_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE, lag = 0))
 all_one_lag_forecasts
 plot_dec_forecasts(all_one_lag_forecasts, default_geos)
 ```
 
 Still constantly falling, unfortunately, and a surprisingly similar forecast.
-Sanity check that this is actually a different forecast:
-
-```{r}
-all_forecasts %>%
-  left_join(all_one_lag_forecasts, by = c("geo_value", "forecast_date", "target_end_date", "quantile")) %>%
-  mutate(value = value.x - value.y) %>%
-  summarize(net_diff = sum(abs(value)))
-```
 
-Which is indeed larger than zero
+# Inspecting the linear model coefficients
 
-# Inspecting the linear coefficients
-
-First lets examine the coefficients that are actually fit; to do that from within scaled_pop would involve a `browser()`. 
+First lets examine the coefficients that are actually fit; to do that from within scaled_pop would involve a `browser()`.
 For the sake of reproducibility, we will make the steps by hand.
 Note that I've tried this section with both filtering pre 2022 values and not, and the results are approximately the same.
 
 ```{r}
-hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date) %>% filter(time_value > "2022-06-01")
+hhs_forecast <- hhs_archive %>%
+  epix_as_of(forecast_date) %>%
+  filter(time_value > train_value_min)
 ```
 
 ```{r}
-linear_get_preproc <- function(ahead, lag = c(0,7,14), in_data = hhs_forecast) {
+linear_get_preproc <- function(ahead, lag = c(0, 7, 14), in_data = hhs_forecast) {
   preproc <- epi_recipe(in_data) %>%
+    step_population_scaling(
+      hhs,
+      df = epidatasets::state_census,
+      df_pop_col = "pop",
+      create_new = FALSE,
+      rate_rescaling = 1e5,
+      by = c("geo_value" = "abbr")
+    ) %>%
     step_adjust_latency(hhs, method = "extend_lags") %>%
     step_epi_lag(hhs, lag = lag) %>%
     step_epi_ahead(hhs, ahead = ahead) %>%
     step_epi_naomit()
 }
-linear_get_workflow <- function(ahead, lag = c(0,7,14), in_data = hhs_forecast) {
+linear_get_workflow <- function(ahead, lag = c(0, 7, 14), in_data = hhs_forecast) {
   preproc <- linear_get_preproc(ahead, lag, in_data)
   postproc <- frosting() %>%
     layer_predict() %>%
@@ -336,6 +315,7 @@ all_workflows <- map(0:4 * 7, linear_get_workflow)
 ```
 
 Starting with the largest ahead, the coefficients are
+
 ```{r}
 workflows::extract_fit_parsnip(all_workflows[[5]])
 ```
@@ -350,15 +330,10 @@ Thanks to latency this is actually a one week forward projection, so we can't ex
 workflows::extract_fit_parsnip(all_workflows[[1]])
 ```
 
-And that is surprisingly close to exactly the `lag_7_hhs` value. 
+And that is surprisingly close to exactly the `lag_7_hhs` value.
 If the signal were constant so far though, it would still be predicting a decrease thanks to the `lag_21_hhs` coefficient.
 
-## Presence/absence of an intercept
-> Wonder if this could be due to using an intercept combined with broad geo and time pooling.
-
-This is a bit more annoying to try to implement, since it seems that parsnip doesn't support disabling the intercept outside the formula (so we'd have to do it in the recipe somehow).
-
-## Inspecting the actual data that is fit
+## Fitting simple linear models to the data yields coefficients less than one
 
 Given that something strange is going on with the data that we're fitting, it is worth plotting the data that we're fitting.
 Since visualizing a 4D vector is a pain, let's start with the `lag = 0` case, which still has similar behavior (and in fact is the dominant coefficient above anyways).
@@ -367,10 +342,23 @@ The way to get the data is using prep and bake:
 ```{r}
 preproc <- linear_get_preproc(0:4 * 7)
 fit_data <- preproc %>%
-  prep(hhs_forecast) %>% bake(hhs_forecast) %>%
-  select(geo_value, time_value, starts_with("lag"), starts_with("ahead"))
-lm_fit <- lm(ahead_28_hhs ~ lag_7_hhs, fit_data)
-fit_data %>% drop_na(lag_7_hhs, ahead_28_hhs) %>% ggplot(aes(x = lag_7_hhs, y = ahead_28_hhs)) + geom_point() + geom_smooth(method = 'lm', formula = y~x) + annotate(label = sprintf("y = %.3f + %.3f x\nR² = %.2f", coef(lm_fit)[[1]], coef(lm_fit)[[2]], summary(lm_fit)$r.squared), geom = "text", x = 1.75, y = 1.75, size = 7) + geom_abline(intercept = 0, slope = 1)
+  prep(hhs_forecast) %>%
+  bake(hhs_forecast) %>%
+  select(geo_value, time_value, starts_with("lag"), starts_with("ahead")) %>%
+  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
+  drop_na(lag_7_hhs, value) %>%
+  arrange(geo_value, time_value, ahead_value) %>%
+  mutate(epi_week = epiweek(time_value))
+
+fit_data %>%
+  ggplot(aes(x = lag_7_hhs, y = value)) +
+  geom_hex(alpha = 0.5, bins = 30) +
+  geom_point(aes(color = epi_week), alpha = 0.3, size = 0.5) +
+  geom_smooth(method = "lm", formula = y ~ x) +
+  geom_abline(intercept = 0, slope = 1) +
+  facet_wrap(~ahead_value, scales = "free") +
+  scale_color_viridis_c() +
+  scale_fill_viridis_c(option = "magma", trans = "log10")
 ```
 
 which is... positive at least.
@@ -393,13 +381,11 @@ plot_linear_data <- function(fit_data_long) {
     ggplot(aes(x = lag_7_hhs, y = value, color = epi_week)) +
     geom_point() +
     ggplot2::scale_color_viridis_c() +
-    geom_smooth(method = 'lm', formula = y~x) +
+    geom_smooth(method = "lm", formula = y ~ x) +
     geom_abline(intercept = 0, slope = 1) +
     facet_wrap(~ahead_value, scales = "free")
 }
 fit_data %>%
-  mutate(epi_week = epiweek(time_value)) %>%
-  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
   plot_linear_data()
 ```
 
@@ -410,9 +396,7 @@ If we crudely just cut out everything with an `epi_week` below 10:
 
 ```{r}
 fit_data %>%
-  mutate(epi_week = epiweek(time_value)) %>%
   filter(epi_week > 10) %>%
-  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
   plot_linear_data()
 ```
 
@@ -421,9 +405,7 @@ For some reason most of the remaining points egregiously below the diagonal are
 
 ```{r}
 fit_data %>%
-  mutate(epi_week = epiweek(time_value)) %>%
   filter(epi_week > 10, epi_week < 48) %>%
-  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
   plot_linear_data()
 ```
 
@@ -431,6 +413,7 @@ Then suddenly the slopes can be quite large!
 This is of course not a particularly principled way of selecting training data.
 
 # Reproducing in the context of covid
+
 Constantly decreasing forecasters has been less of an issue in covid, so we should do a comparison.
 Since we're assuming the project is `flu_hosp_explore`, we have to directly access the covid archive.
 Covid is in counts, so first we convert to rates.
@@ -438,15 +421,17 @@ Covid is in counts, so first we convert to rates.
 ```{r}
 hhs_covid_archive <-
   qs2::qs_read(here::here("covid_hosp_explore/objects/hhs_archive"))$DT %>%
-    filter(time_value > "2022-06-01") %>% 
-    left_join(state_census, by = join_by(geo_value == abbr)) %>%
-    mutate(value = value / pop * 1e5) %>%
-    select(-fips, -name, -pop) %>%
-    rename(hhs = value) %>%
-    as_epi_archive()
+  filter(time_value > train_value_min) %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
+  mutate(value = value / pop * 1e5) %>%
+  select(-fips, -name, -pop) %>%
+  rename(hhs = value) %>%
+  as_epi_archive()
 hhs_covid_forecast <- hhs_covid_archive %>% epix_as_of(forecast_date)
 hhs_covid_forecast %>% autoplot(hhs)
-hhs_covid_forecast %>% pull(hhs) %>% max
+hhs_covid_forecast %>%
+  pull(hhs) %>%
+  max()
 ```
 
 Forecasting using the same methods as the original problem
@@ -493,13 +478,11 @@ This is significantly closer to a positive linear slope, though it is still nega
 
 ```{r}
 fit_data %>%
-  mutate(epi_week = epiweek(time_value)) %>%
-  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
   drop_na(value) %>%
   ggplot(aes(x = lag_7_hhs, y = value, color = epi_week)) +
   geom_point() +
   ggplot2::scale_color_viridis_c() +
-  geom_smooth(method = 'lm', formula = y~x) +
+  geom_smooth(method = "lm", formula = y ~ x) +
   geom_abline(intercept = 0, slope = 1) +
   scale_y_log10() +
   scale_x_log10() +
@@ -507,17 +490,26 @@ fit_data %>%
 ```
 
 # What if we restrict to `10<epi_week<48`
-This turned
+
+TODO.
+
 # Direct vs iterative forecasting
-> Iterative will be monotonic in the horizon. Direct isn’t guaranteed.
+
+Iterative will be monotonic in the horizon. Direct isn’t guaranteed.
 
 Somewhat hard to express, there's also something about the increasing staleness of data that's relevant.
 Actually implementing an experiment just means including the median from the previous forecast as a data point and forecasting one day further into the future.
 
-
 # Does filtering by growth rate change it?
 
-# Dividing by population squared
+TODO.
+
+# Appendix
+
+Some other investigations.
+
+## Dividing by population squared
+
 This was an accidental find, but if we *do* population scale, the problem goes away.
 
 ```{r}
@@ -547,6 +539,7 @@ The majority of growth rates are positive; there's some amount of negative predi
 There appears to be something about fitting quantiles specifically that is causing this problem.
 
 ## Fitting one quantile
+
 What if we just fit the median? This shouldn't change anything
 
 ```{r}
@@ -603,3 +596,52 @@ tibble(geo_value = pred_data$geo_value, value = predictions) %>%
   mutate(value = value * pop / 1e5) %>%
   select(-fips, -name, -pop)
 ```
+
+## Presence/absence of an intercept
+
+This is a bit more annoying to try to implement, since it seems that parsnip doesn't support disabling the intercept outside the formula (so we'd have to do it in the recipe somehow).
+
+## Is it geo pooling?
+
+TL;DR: No. The phenomena is still happening, at least for the default geos. Too
+slow to run regularly. Let's see what happens if we restrict ourselves to
+training each geo separately.
+
+```{r, eval = FALSE}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+all_geos <- hhs_forecast %>%
+  distinct(geo_value) %>%
+  pull(geo_value)
+hhs_forecast %>%
+  filter(!is.na(hhs)) %>%
+  group_by(geo_value) %>%
+  summarize(n_points = n()) %>%
+  arrange(n_points)
+all_geos_forecasts <- map(all_geos, \(geo) forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE), epi_data = hhs_forecast %>% filter(geo_value == geo))) %>% list_rbind()
+all_geos_forecasts %>% plot_dec_forecasts(default_geos)
+```
+
+```{r, eval = FALSE}
+geos_gr <- get_growth_rates(all_geos_forecasts, quantiles = 0.5, method = "smooth_spline")
+geos_gr %>% arrange(desc(growth))
+```
+This is at least more of a mixed bag, with plenty of states with positive growth.
+
+```{r, eval = FALSE}
+geos_gr %>% ggplot(aes(x = growth)) +
+  geom_histogram(bins = 300)
+```
+But most have a negative growth.
+
+## How different is not geo pooling anyways?
+Well it is at least different; how exactly is hard to parse:
+```{r, eval = FALSE}
+all_geos_forecasts %>%
+  left_join(all_forecasts, by = join_by(geo_value, forecast_date, target_end_date, quantile), suffix = c("_geo", "_joint")) %>%
+  mutate(value = value_geo - value_joint) %>%
+  select(-value_geo, -value_joint) %>%
+  filter(geo_value %in% default_geos) %>%
+  ggplot(aes(x = target_end_date, group = geo_value)) +
+  geom_point(aes(y = value, color = quantile)) +
+  facet_wrap(~geo_value, scale = "free")
+```
\ No newline at end of file

From 4c85eff77442b3eb2f8092f832ea940ed7841b12 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Fri, 11 Apr 2025 21:47:42 -0700
Subject: [PATCH 11/62] tweaks

---
 scripts/reports/decreasing_forecasters.Rmd | 97 ++++++++++------------
 1 file changed, 43 insertions(+), 54 deletions(-)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index 73e006bb..c13bcd10 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -341,7 +341,7 @@ The way to get the data is using prep and bake:
 
 ```{r}
 preproc <- linear_get_preproc(0:4 * 7)
-fit_data <- preproc %>%
+fit_data_long <- preproc %>%
   prep(hhs_forecast) %>%
   bake(hhs_forecast) %>%
   select(geo_value, time_value, starts_with("lag"), starts_with("ahead")) %>%
@@ -349,16 +349,6 @@ fit_data <- preproc %>%
   drop_na(lag_7_hhs, value) %>%
   arrange(geo_value, time_value, ahead_value) %>%
   mutate(epi_week = epiweek(time_value))
-
-fit_data %>%
-  ggplot(aes(x = lag_7_hhs, y = value)) +
-  geom_hex(alpha = 0.5, bins = 30) +
-  geom_point(aes(color = epi_week), alpha = 0.3, size = 0.5) +
-  geom_smooth(method = "lm", formula = y ~ x) +
-  geom_abline(intercept = 0, slope = 1) +
-  facet_wrap(~ahead_value, scales = "free") +
-  scale_color_viridis_c() +
-  scale_fill_viridis_c(option = "magma", trans = "log10")
 ```
 
 which is... positive at least.
@@ -377,15 +367,16 @@ Making a similar plot across the values of head:
 ```{r}
 plot_linear_data <- function(fit_data_long) {
   fit_data_long %>%
-    drop_na(value) %>%
-    ggplot(aes(x = lag_7_hhs, y = value, color = epi_week)) +
-    geom_point() +
-    ggplot2::scale_color_viridis_c() +
+    ggplot(aes(x = lag_7_hhs, y = value)) +
+    geom_hex(alpha = 0.5, bins = 30) +
+    geom_point(aes(color = epi_week), alpha = 0.3, size = 0.5) +
     geom_smooth(method = "lm", formula = y ~ x) +
     geom_abline(intercept = 0, slope = 1) +
-    facet_wrap(~ahead_value, scales = "free")
+    facet_wrap(~ahead_value, scales = "free") +
+    scale_color_viridis_c() +
+    scale_fill_viridis_c(option = "magma", trans = "log10")
 }
-fit_data %>%
+fit_data_long %>%
   plot_linear_data()
 ```
 
@@ -395,7 +386,7 @@ Adding the color corresponding to the season week potentially gives us some idea
 If we crudely just cut out everything with an `epi_week` below 10:
 
 ```{r}
-fit_data %>%
+fit_data_long %>%
   filter(epi_week > 10) %>%
   plot_linear_data()
 ```
@@ -404,7 +395,7 @@ which is better but still below one.
 For some reason most of the remaining points egregiously below the diagonal are in `epi_weeks` very near 50. If we filter out the last 4 weeks:
 
 ```{r}
-fit_data %>%
+fit_data_long %>%
   filter(epi_week > 10, epi_week < 48) %>%
   plot_linear_data()
 ```
@@ -412,6 +403,25 @@ fit_data %>%
 Then suddenly the slopes can be quite large!
 This is of course not a particularly principled way of selecting training data.
 
+## Aside: what if we fit on a log scale?
+
+I originally added this plot in an attempt to make the blob near zero clearer, but it actually just did a linear fit on the log graph.
+This is significantly closer to a positive linear slope, though it is still negative.
+
+```{r}
+fit_data_long %>%
+  ggplot(aes(x = lag_7_hhs, y = value)) +
+  geom_hex(alpha = 0.5, bins = 30) +
+  geom_point(aes(color = epi_week), alpha = 0.3, size = 0.5) +
+  geom_smooth(method = "lm", formula = y ~ x) +
+  geom_abline(intercept = 0, slope = 1) +
+  scale_y_log10() +
+  scale_x_log10() +
+  facet_wrap(~ahead_value, scales = "free") +
+  scale_color_viridis_c() +
+  scale_fill_viridis_c(option = "magma", trans = "log10")
+```
+
 # Reproducing in the context of covid
 
 Constantly decreasing forecasters has been less of an issue in covid, so we should do a comparison.
@@ -471,39 +481,6 @@ Which is marginally better.
 The problematic data is less clear here date-wise than in the case of flu; partly this is because the data covers a much larger range of values.
 Partly I suspect that this is also it's less seasonal.
 
-## Aside: what if we fit on a log scale?
-
-I originally added this plot in an attempt to make the blob near zero clearer, but it actually just did a linear fit on the log graph.
-This is significantly closer to a positive linear slope, though it is still negative.
-
-```{r}
-fit_data %>%
-  drop_na(value) %>%
-  ggplot(aes(x = lag_7_hhs, y = value, color = epi_week)) +
-  geom_point() +
-  ggplot2::scale_color_viridis_c() +
-  geom_smooth(method = "lm", formula = y ~ x) +
-  geom_abline(intercept = 0, slope = 1) +
-  scale_y_log10() +
-  scale_x_log10() +
-  facet_wrap(~ahead_value, scales = "free")
-```
-
-# What if we restrict to `10<epi_week<48`
-
-TODO.
-
-# Direct vs iterative forecasting
-
-Iterative will be monotonic in the horizon. Direct isn’t guaranteed.
-
-Somewhat hard to express, there's also something about the increasing staleness of data that's relevant.
-Actually implementing an experiment just means including the median from the previous forecast as a data point and forecasting one day further into the future.
-
-# Does filtering by growth rate change it?
-
-TODO.
-
 # Appendix
 
 Some other investigations.
@@ -633,7 +610,8 @@ geos_gr %>% ggplot(aes(x = growth)) +
 ```
 But most have a negative growth.
 
-## How different is not geo pooling anyways?
+### How different is not geo pooling anyways?
+
 Well it is at least different; how exactly is hard to parse:
 ```{r, eval = FALSE}
 all_geos_forecasts %>%
@@ -644,4 +622,15 @@ all_geos_forecasts %>%
   ggplot(aes(x = target_end_date, group = geo_value)) +
   geom_point(aes(y = value, color = quantile)) +
   facet_wrap(~geo_value, scale = "free")
-```
\ No newline at end of file
+```
+
+## Direct vs iterative forecasting
+
+Iterative will be monotonic in the horizon. Direct isn’t guaranteed.
+
+Somewhat hard to express, there's also something about the increasing staleness of data that's relevant.
+Actually implementing an experiment just means including the median from the previous forecast as a data point and forecasting one day further into the future.
+
+## Does filtering by growth rate change it?
+
+TODO.
\ No newline at end of file

From e6f2b07091a1816450dee54b1d30674a88f6f8d7 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Mon, 14 Apr 2025 11:32:17 -0500
Subject: [PATCH 12/62] yet more tweaks

---
 renv.lock                                  | 52 +-----------
 renv/activate.R                            |  2 +-
 scripts/reports/decreasing_forecasters.Rmd | 97 ++++++++++++----------
 3 files changed, 58 insertions(+), 93 deletions(-)

diff --git a/renv.lock b/renv.lock
index c76fcb77..bde6fe4c 100644
--- a/renv.lock
+++ b/renv.lock
@@ -7413,56 +7413,8 @@
     },
     "renv": {
       "Package": "renv",
-      "Version": "1.1.3",
-      "Source": "Repository",
-      "Type": "Package",
-      "Title": "Project Environments",
-      "Authors@R": "c( person(\"Kevin\", \"Ushey\", role = c(\"aut\", \"cre\"), email = \"kevin@rstudio.com\", comment = c(ORCID = \"0000-0003-2880-7407\")), person(\"Hadley\", \"Wickham\", role = c(\"aut\"), email = \"hadley@rstudio.com\", comment = c(ORCID = \"0000-0003-4757-117X\")), person(\"Posit Software, PBC\", role = c(\"cph\", \"fnd\")) )",
-      "Description": "A dependency management toolkit for R. Using 'renv', you can create and manage project-local R libraries, save the state of these libraries to a 'lockfile', and later restore your library as required. Together, these tools can help make your projects more isolated, portable, and reproducible.",
-      "License": "MIT + file LICENSE",
-      "URL": "https://rstudio.github.io/renv/, https://github.com/rstudio/renv",
-      "BugReports": "https://github.com/rstudio/renv/issues",
-      "Imports": [
-        "utils"
-      ],
-      "Suggests": [
-        "BiocManager",
-        "cli",
-        "compiler",
-        "covr",
-        "cpp11",
-        "devtools",
-        "gitcreds",
-        "jsonlite",
-        "jsonvalidate",
-        "knitr",
-        "miniUI",
-        "modules",
-        "packrat",
-        "pak",
-        "R6",
-        "remotes",
-        "reticulate",
-        "rmarkdown",
-        "rstudioapi",
-        "shiny",
-        "testthat",
-        "uuid",
-        "waldo",
-        "yaml",
-        "webfakes"
-      ],
-      "Encoding": "UTF-8",
-      "RoxygenNote": "7.3.2",
-      "VignetteBuilder": "knitr",
-      "Config/Needs/website": "tidyverse/tidytemplate",
-      "Config/testthat/edition": "3",
-      "Config/testthat/parallel": "true",
-      "Config/testthat/start-first": "bioconductor,python,install,restore,snapshot,retrieve,remotes",
-      "NeedsCompilation": "no",
-      "Author": "Kevin Ushey [aut, cre] (<https://orcid.org/0000-0003-2880-7407>), Hadley Wickham [aut] (<https://orcid.org/0000-0003-4757-117X>), Posit Software, PBC [cph, fnd]",
-      "Maintainer": "Kevin Ushey <kevin@rstudio.com>",
-      "Repository": "RSPM"
+      "Version": "1.1.4",
+      "Source": "Repository"
     },
     "reprex": {
       "Package": "reprex",
diff --git a/renv/activate.R b/renv/activate.R
index 623cc4e7..90b251ca 100644
--- a/renv/activate.R
+++ b/renv/activate.R
@@ -2,7 +2,7 @@
 local({
 
   # the requested version of renv
-  version <- "1.1.3"
+  version <- "1.1.4"
   attr(version, "sha") <- NULL
 
   # the project directory
diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index c13bcd10..400cba05 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -20,34 +20,46 @@ knitr::opts_chunk$set(
   warning = FALSE,
   cache = FALSE
 )
+options(
+  dplyr.print_min = 6,
+  dplyr.print_max = 6,
+  pillar.max_footer_lines = 2,
+  pillar.min_chars = 15,
+  stringr.view_n = 6,
+  pillar.bold = TRUE,
+  width = 100
+)
 knitr::opts_knit$set(root.dir = here::here())
 ggplot2::theme_set(ggplot2::theme_bw())
 source(here::here("R/load_all.R"))
 ```
 
-Partially part of the retrospective from this year.
 For many of the direct forecasters, the forecast is strictly decreasing, even in the middle of the surge.
 This effect is most prominent in flu, but occurs somewhat in covid.
 We need to resolve the source of this.
 It is some combination of the data and the models used.
 
+Roughly, the conclusion after having done these experiments is that it is in fact **an over-representation of decreasing `time_value`s that is causing this behavior** (especially look at [fitting simple linear models to the data](#fitting-simple-linear-models-to-the-data-yields-coefficients-less-than-one)).
+Some possible fixes include:
+
+1. Some method of data filtering or weighting based on the current growth rate.
+   The seasonal forecaster probably benefits from this effect.
+   An alternative would be to bring back some sort of classifier-type system (either using an actual classifier, or filtering to comparable growth rates).
+2. Improve our ability to use non-linear classifiers. 
+   These would likely be able to handle forecasting both increasing and decreasing signals, but would require a better method of generating quantiles. 
+   For example, by comparing a given forecast to the climate model, or a comparison with the trajectories that occur on comparable forecast dates (this brings us back to 1).
+
 # Setup
 
 ## Data
 
 This notebook depends on having successfully run the `flu_hosp_explore` targets pipeline to handle the creation of the basic dataset.
 
-```{r echo=FALSE}
+```{r results = "hide"}
 Sys.setenv(TAR_PROJECT = "flu_hosp_explore")
-tar_make(joined_archive_data)
-joined_archive <- tar_read(joined_archive_data)
+tar_make(hhs_archive)
 hhs_archive <- tar_read(hhs_archive) %>% as_epi_archive()
-train_value_min <- "2022-06-01"
-```
-
-The data in question
-
-```{r}
+train_value_min <- "2002-06-01"
 hhs_archive %>%
   epix_as_of_current() %>%
   filter(time_value > "2023-10-01") %>%
@@ -55,6 +67,7 @@ hhs_archive %>%
 ```
 
 To avoid running too frequently, we'll limit to a single forecast date just after the peak of the rate of growth, so that ~ everywhere is increasing.
+Here's a table of the number of locations that have their max on any given week:
 
 ```{r}
 hhs_gr <- hhs_archive %>%
@@ -68,10 +81,11 @@ hhs_gr %>%
   slice_max(gr_hhs) %>%
   ungroup() %>%
   group_by(time_value) %>%
-  summarize(nn = length(hhs))
+  summarize(nn = length(hhs)) %>%
+  print(n=16)
 ```
 
-So the peak is ~ 11/15
+Since most have the largest growth rate on the 15th, so let's choose 11/29 as our `forecast_date`, to make sure there's some trend for the forecasters to pick up on.
 
 ```{r}
 forecast_date <- as.Date("2023-11-29")
@@ -80,9 +94,9 @@ hhs_gr %>% autoplot(gr_hhs) +
   labs(title = "growth rates")
 ```
 
-And most locations are still increasing 2 weeks later on the 29th, so we'll use that
+A plot to confirm that most  locations are still increasing on the 29th.
 
-Note that this is RATE DATA, and not count data, so `pop_scaling = FALSE` should be the default
+Note that this is RATE DATA, and not count data, so `pop_scaling = FALSE` should be the default.
 
 ## Some utility functions
 
@@ -160,20 +174,16 @@ plot_dec_forecasts(all_forecasts, default_geos)
 ```
 
 All the forecasts are going down rather than up, even though they have multiple weeks of data!
-More quantitatively, across all geos:
+More quantitatively, let's compute the growth rate of the median forecast for each ahead, across all geos, and then look at a histogram of that
 ```{r}
 basic_gr <- get_growth_rates(all_forecasts, quantiles = 0.5, method = "smooth_spline")
-basic_gr %>% arrange(desc(growth))
-```
-
-The only places where the growth rate is positive are American samoa and the US overall, both of which have unusual data trends (as because it is ~0, and the US because it is unusually large).
-As a histogram (each state is included 5 times, once per ahead):
-
-```{r}
+basic_gr %>% arrange(desc(growth)) %>% print(n=30)
 basic_gr %>% ggplot(aes(x = growth)) +
   geom_histogram(bins = 300)
 ```
 
+There are a few exceptional locations with actually positive growth rates, but the vast majority of aheads and geos have negative growth rates.
+
 # Model tweaks
 
 ## Very short training windows remove the decreasing problem
@@ -197,7 +207,9 @@ short_gr %>%
   ggplot(aes(x = growth)) +
   geom_histogram(bins = 300)
 ```
-So on a day-over-day basis the growth rate is mostly increasing, with some strong positive outliers and some amount decreasing.
+
+So most aheads and locations have a positive growth rate, with some strong positive outliers and some amount decreasing.
+This generally fits with the jittery but positive nature of the example location plots.
 
 ## Switching to linear regression slightly mitigates, but doesn't remove the problem
 
@@ -221,14 +233,14 @@ basic_gr %>%
   print(n = 30)
 ```
 
-There's a least a good number of locations/aheads that are positive
+There's a least a good number of locations/aheads that have a positive growth rate.
 
 ```{r}
 basic_gr %>% ggplot(aes(x = growth)) +
   geom_histogram(bins = 300)
 ```
 
-But the majority is negative.
+But the majority are negative.
 
 ## Switching to a nonlinear engine removes the decreasing problem
 
@@ -246,7 +258,7 @@ plot_dec_forecasts(all_boost_forecasts, default_geos)
 
 Boosted trees don't have the problem? Mostly?
 The forecasts aren't great, but at least they're not plummeting.
-The quantiles are garbage, but that's kind of to be expected.
+The quantiles are garbage, but that's kind of to be expected with residual quantiles on a non-linear method.
 
 ```{r}
 basic_gr <- get_growth_rates(all_boost_forecasts, quantiles = 0.5, method = "smooth_spline")
@@ -275,6 +287,7 @@ First lets examine the coefficients that are actually fit; to do that from withi
 For the sake of reproducibility, we will make the steps by hand.
 Note that I've tried this section with both filtering pre 2022 values and not, and the results are approximately the same.
 
+filtering the data
 ```{r}
 hhs_forecast <- hhs_archive %>%
   epix_as_of(forecast_date) %>%
@@ -320,7 +333,7 @@ Starting with the largest ahead, the coefficients are
 workflows::extract_fit_parsnip(all_workflows[[5]])
 ```
 
-So the intercept is actually positive (so it's not biased towards decreasing inherently), but the coefficients for the two lags are negative.
+So the intercept is actually positive (so it's not biased towards decreasing inherently), but the coefficients for two of the lags are negative.
 Even including that, the `lag_7_hhs` coefficient is less than one, so regardless of the fact that two coefficients are negative it will de-facto always be below the original value.
 
 How about the zero ahead?
@@ -330,46 +343,44 @@ Thanks to latency this is actually a one week forward projection, so we can't ex
 workflows::extract_fit_parsnip(all_workflows[[1]])
 ```
 
-And that is surprisingly close to exactly the `lag_7_hhs` value.
+But it is still surprisingly close to exactly the `lag_7_hhs` value.
 If the signal were constant so far though, it would still be predicting a decrease thanks to the `lag_21_hhs` coefficient.
 
 ## Fitting simple linear models to the data yields coefficients less than one
 
-Given that something strange is going on with the data that we're fitting, it is worth plotting the data that we're fitting.
-Since visualizing a 4D vector is a pain, let's start with the `lag = 0` case, which still has similar behavior (and in fact is the dominant coefficient above anyways).
+Given that something strange is going on with the data that we're fitting, it is worth plotting the data as it is seen by the linear regressor.
+Since visualizing a 4D vector is a pain, let's start with fitting just the `lag = 0` [case](#fitting-only-1-lag-does-not-change-the-problem), which still has similar behavior (and in fact is the dominant coefficient above anyways).
 The way to get the data is using prep and bake:
 
 ```{r}
 preproc <- linear_get_preproc(0:4 * 7)
-fit_data_long <- preproc %>%
+fit_data_long <-
+  preproc %>%
   prep(hhs_forecast) %>%
   bake(hhs_forecast) %>%
   select(geo_value, time_value, starts_with("lag"), starts_with("ahead")) %>%
   pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
   drop_na(lag_7_hhs, value) %>%
+  mutate(ahead_value = factor(ahead_value, c("ahead_0_hhs", "ahead_7_hhs", "ahead_14_hhs", "ahead_21_hhs", "ahead_28_hhs"))) %>%
   arrange(geo_value, time_value, ahead_value) %>%
   mutate(epi_week = epiweek(time_value))
 ```
 
-which is... positive at least.
-More importantly though, the coefficient on x is less than one, so this still decays.
-
-Comparing with the coefficient for this ahead and only this lag:
+The fit for 28 days ahead is
 
 ```{r}
 all_single_workflows <- map(0:4 * 7, \(ahead) linear_get_workflow(ahead, lag = 0))
 workflows::extract_fit_parsnip(all_single_workflows[[5]])
 ```
 
-which is the same value, so it's the same fit (as we should expect).
-Making a similar plot across the values of head:
+which is positive but less than one, and thus predicts a decrease.
+Since that data as seen by this regression is 1D, we can plot it against the target ahead, and then plot the regression.
 
 ```{r}
 plot_linear_data <- function(fit_data_long) {
   fit_data_long %>%
     ggplot(aes(x = lag_7_hhs, y = value)) +
-    geom_hex(alpha = 0.5, bins = 30) +
-    geom_point(aes(color = epi_week), alpha = 0.3, size = 0.5) +
+    geom_point(aes(color = epi_week), alpha = 1, size = 0.7) +
     geom_smooth(method = "lm", formula = y ~ x) +
     geom_abline(intercept = 0, slope = 1) +
     facet_wrap(~ahead_value, scales = "free") +
@@ -392,7 +403,7 @@ fit_data_long %>%
 ```
 
 which is better but still below one.
-For some reason most of the remaining points egregiously below the diagonal are in `epi_weeks` very near 50. If we filter out the last 4 weeks:
+For some reason most of the remaining points egregiously below the diagonal are in `epi_weeks` very near 50. If we also filter out the last 4 weeks:
 
 ```{r}
 fit_data_long %>%
@@ -458,10 +469,12 @@ preproc <- linear_get_preproc(0:4 * 7, in_data = hhs_covid_forecast)
 fit_covid_data <- preproc %>%
   prep(hhs_covid_forecast) %>%
   bake(hhs_covid_forecast) %>%
-  select(geo_value, time_value, starts_with("lag"), starts_with("ahead"))
+  select(geo_value, time_value, starts_with("lag"), starts_with("ahead")) %>%
+  mutate(ahead_value = factor(ahead_value, c("ahead_0_hhs", "ahead_7_hhs", "ahead_14_hhs", "ahead_21_hhs", "ahead_28_hhs"))) %>%
 fit_covid_data %>%
   mutate(epi_week = epiweek(time_value)) %>%
   pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
+  mutate(ahead_value = factor(ahead_value, c("ahead_0_hhs", "ahead_7_hhs", "ahead_14_hhs", "ahead_21_hhs", "ahead_28_hhs"))) %>%
   plot_linear_data()
 ```
 
@@ -633,4 +646,4 @@ Actually implementing an experiment just means including the median from the pre
 
 ## Does filtering by growth rate change it?
 
-TODO.
\ No newline at end of file
+TODO.

From adc5bc3b55a883772e1ce11bc9a8406564a1a352 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Mon, 14 Apr 2025 12:17:42 -0500
Subject: [PATCH 13/62] clearer read-through for others

---
 scripts/reports/decreasing_forecasters.Rmd | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index 400cba05..69ef075c 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -450,9 +450,6 @@ hhs_covid_archive <-
   as_epi_archive()
 hhs_covid_forecast <- hhs_covid_archive %>% epix_as_of(forecast_date)
 hhs_covid_forecast %>% autoplot(hhs)
-hhs_covid_forecast %>%
-  pull(hhs) %>%
-  max()
 ```
 
 Forecasting using the same methods as the original problem
@@ -461,7 +458,17 @@ all_covid_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead =
 plot_dec_forecasts(all_covid_forecasts, default_geos, hhs_covid_archive)
 ```
 
-Still sort of present!
+Still sort of present! 
+And does it show up in a histogram of growth rates?
+
+```{r}
+basic_gr <- get_growth_rates(all_covid_forecasts, quantiles = 0.5, method = "smooth_spline")
+basic_gr %>% arrange(desc(growth)) %>% print(n=30)
+basic_gr %>% ggplot(aes(x = growth)) +
+  geom_histogram(bins = 300)
+```
+
+On average the growth rate is positive, but really at this point ~none of the forecasts should have a negative slope of the median.
 Let's take a look at what the forecast is actually seeing
 
 ```{r}
@@ -469,8 +476,7 @@ preproc <- linear_get_preproc(0:4 * 7, in_data = hhs_covid_forecast)
 fit_covid_data <- preproc %>%
   prep(hhs_covid_forecast) %>%
   bake(hhs_covid_forecast) %>%
-  select(geo_value, time_value, starts_with("lag"), starts_with("ahead")) %>%
-  mutate(ahead_value = factor(ahead_value, c("ahead_0_hhs", "ahead_7_hhs", "ahead_14_hhs", "ahead_21_hhs", "ahead_28_hhs"))) %>%
+  select(geo_value, time_value, starts_with("lag"), starts_with("ahead")) 
 fit_covid_data %>%
   mutate(epi_week = epiweek(time_value)) %>%
   pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
@@ -487,6 +493,7 @@ fit_covid_data %>%
   mutate(epi_week = epiweek(time_value)) %>%
   filter(epi_week > 10) %>%
   pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
+  mutate(ahead_value = factor(ahead_value, c("ahead_0_hhs", "ahead_7_hhs", "ahead_14_hhs", "ahead_21_hhs", "ahead_28_hhs"))) %>%
   plot_linear_data()
 ```
 
@@ -500,7 +507,7 @@ Some other investigations.
 
 ## Dividing by population squared
 
-This was an accidental find, but if we *do* population scale, the problem goes away.
+This was an accidental find, but if we *do* population scale the already population scaled data, the problem mostly goes away.
 
 ```{r}
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)

From 2057e79e69345728ab1599574289d9d1ce7e7553 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Mon, 14 Apr 2025 12:40:10 -0500
Subject: [PATCH 14/62] tests borked b/c curl?

---
 .github/workflows/tests.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 5cf55c1b..feb8cc39 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -38,7 +38,7 @@ jobs:
 
       - name: System dependencies
         run: |
-          sudo apt-get update && sudo apt-get -y install libglpk-dev
+          sudo apt-get update && sudo apt-get -y install libglpk-dev libcurl4-openssl-dev
 
       - uses: r-lib/actions/setup-renv@v2
 

From 6d767b7a4b8eae80a18cecd3700b80b4a8d8f249 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Mon, 14 Apr 2025 13:18:23 -0500
Subject: [PATCH 15/62] filter pre 2022, test dependencies

---
 .github/workflows/tests.yaml               |  2 +-
 scripts/reports/decreasing_forecasters.Rmd | 50 +++++++++++++++++++++-
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index feb8cc39..562f3e37 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -38,7 +38,7 @@ jobs:
 
       - name: System dependencies
         run: |
-          sudo apt-get update && sudo apt-get -y install libglpk-dev libcurl4-openssl-dev
+          sudo apt-get update && sudo apt-get -y install libglpk-dev libcurl4-openssl-dev libfontconfig1-dev
 
       - uses: r-lib/actions/setup-renv@v2
 
diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index 69ef075c..aa44c678 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -281,6 +281,51 @@ plot_dec_forecasts(all_one_lag_forecasts, default_geos)
 
 Still constantly falling, unfortunately, and a surprisingly similar forecast.
 
+## Removing 2020/21 and 2021/22 improves but doesn't fix the situation
+The first 2 years are bad for flu, as there was ~no spread.
+In practice we know this behavior happens when we remove those years, and even the summers.
+I'm including this primarily to compare the change in growth rates.
+
+```{r}
+hhs_forecast_recent <- hhs_archive %>%
+  epix_as_of(forecast_date) %>%
+  filter(time_value > "2022-06-01")
+all_recent_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE), epi_data = hhs_forecast_recent)
+plot_dec_forecasts(all_recent_forecasts, default_geos)
+```
+
+Which is still decreasing.
+
+```{r}
+basic_gr <- get_growth_rates(all_recent_forecasts, quantiles = 0.5, method = "smooth_spline")
+basic_gr %>% arrange(desc(growth)) %>% print(n=30)
+basic_gr %>% ggplot(aes(x = growth)) +
+  geom_histogram(bins = 300)
+```
+
+which are more or less the same level of bad.
+### Linear only 
+However if we filter to 2022 onwards and use a linear engine:
+
+```{r}
+hhs_forecast_recent <- hhs_archive %>%
+  epix_as_of(forecast_date) %>%
+  filter(time_value > "2022-06-01")
+all_recent_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE, trainer = linear_reg()), epi_data = hhs_forecast_recent)
+plot_dec_forecasts(all_recent_forecasts, default_geos)
+```
+
+Some are actually increasing, while some are decreasing.
+
+```{r}
+basic_gr <- get_growth_rates(all_recent_forecasts, quantiles = 0.5, method = "smooth_spline")
+basic_gr %>% arrange(desc(growth)) %>% print(n=30)
+basic_gr %>% ggplot(aes(x = growth)) +
+  geom_histogram(bins = 300)
+```
+
+Overall, more are now positive than they were in the totally unfiltered case, but there's still a number of unreasonably negative values.
+
 # Inspecting the linear model coefficients
 
 First lets examine the coefficients that are actually fit; to do that from within scaled_pop would involve a `browser()`.
@@ -653,4 +698,7 @@ Actually implementing an experiment just means including the median from the pre
 
 ## Does filtering by growth rate change it?
 
-TODO.
+Roughly, the idea is to filter the data to that within a range of the growth rate of the most recent data point.
+```{r}
+
+```

From a2dbebf366c039d62a0a2694b2b6ec83f3e77ca5 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Mon, 14 Apr 2025 15:09:23 -0700
Subject: [PATCH 16/62] enh: add forecasting on diffs (ARI rather than AR)

---
 scripts/reports/decreasing_forecasters.Rmd | 35 ++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index aa44c678..fb11b755 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -696,6 +696,41 @@ Iterative will be monotonic in the horizon. Direct isn’t guaranteed.
 Somewhat hard to express, there's also something about the increasing staleness of data that's relevant.
 Actually implementing an experiment just means including the median from the previous forecast as a data point and forecasting one day further into the future.
 
+## Forecasting on value differences and iteratively summing
+
+Does adding an I (integrated) component to our model improve things?
+
+```{r}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+# First we take differences
+diffed_data <- hhs_forecast %>%
+  group_by(geo_value) %>%
+  mutate(value_diff = hhs - lag(hhs)) %>%
+  ungroup() %>%
+  select(geo_value, time_value, value_diff) %>%
+  filter(geo_value %in% default_geos)
+latest_values <- hhs_forecast %>%
+  filter(geo_value %in% default_geos) %>%
+  slice_max(time_value, by=geo_value) %>%
+  select(geo_value, time_value, value = hhs)
+diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop(x, "value_diff", ahead = ahead, pop_scaling = FALSE), epi_data = diffed_data) %>%
+  select(geo_value, time_value = target_end_date, value, quantile) %>%
+  bind_rows(tidyr::expand_grid(quantile = diffed_forecast$quantile %>% unique, latest_values))
+
+forecast <- diffed_forecast %>%
+  group_by(geo_value, quantile) %>%
+  arrange(time_value) %>%
+  mutate(value = cumsum(value), forecast_date = .env$forecast_date) %>%
+  ungroup() %>%
+  arrange(geo_value, quantile, time_value) %>%
+  select(geo_value, forecast_date, target_end_date = time_value, quantile, value)
+
+forecast %>% plot_dec_forecasts(default_geos)
+```
+
+This is much better than the original forecasts (we're forecasting on the same
+data as in the "problem setup" section).
+
 ## Does filtering by growth rate change it?
 
 Roughly, the idea is to filter the data to that within a range of the growth rate of the most recent data point.

From 8b04c04e39ca759522c6d28f3063b3e734c9c102 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Mon, 14 Apr 2025 15:12:14 -0700
Subject: [PATCH 17/62] doc: add some comments

---
 scripts/reports/decreasing_forecasters.Rmd | 28 +++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index fb11b755..e779c021 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -45,8 +45,8 @@ Some possible fixes include:
 1. Some method of data filtering or weighting based on the current growth rate.
    The seasonal forecaster probably benefits from this effect.
    An alternative would be to bring back some sort of classifier-type system (either using an actual classifier, or filtering to comparable growth rates).
-2. Improve our ability to use non-linear classifiers. 
-   These would likely be able to handle forecasting both increasing and decreasing signals, but would require a better method of generating quantiles. 
+2. Improve our ability to use non-linear classifiers.
+   These would likely be able to handle forecasting both increasing and decreasing signals, but would require a better method of generating quantiles.
    For example, by comparing a given forecast to the climate model, or a comparison with the trajectories that occur on comparable forecast dates (this brings us back to 1).
 
 # Setup
@@ -304,7 +304,7 @@ basic_gr %>% ggplot(aes(x = growth)) +
 ```
 
 which are more or less the same level of bad.
-### Linear only 
+### Linear only
 However if we filter to 2022 onwards and use a linear engine:
 
 ```{r}
@@ -503,7 +503,7 @@ all_covid_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead =
 plot_dec_forecasts(all_covid_forecasts, default_geos, hhs_covid_archive)
 ```
 
-Still sort of present! 
+Still sort of present!
 And does it show up in a histogram of growth rates?
 
 ```{r}
@@ -521,7 +521,7 @@ preproc <- linear_get_preproc(0:4 * 7, in_data = hhs_covid_forecast)
 fit_covid_data <- preproc %>%
   prep(hhs_covid_forecast) %>%
   bake(hhs_covid_forecast) %>%
-  select(geo_value, time_value, starts_with("lag"), starts_with("ahead")) 
+  select(geo_value, time_value, starts_with("lag"), starts_with("ahead"))
 fit_covid_data %>%
   mutate(epi_week = epiweek(time_value)) %>%
   pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
@@ -702,21 +702,22 @@ Does adding an I (integrated) component to our model improve things?
 
 ```{r}
 hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
-# First we take differences
-diffed_data <- hhs_forecast %>%
-  group_by(geo_value) %>%
-  mutate(value_diff = hhs - lag(hhs)) %>%
-  ungroup() %>%
-  select(geo_value, time_value, value_diff) %>%
-  filter(geo_value %in% default_geos)
+# Get the latest values so we know where to sum from later
 latest_values <- hhs_forecast %>%
   filter(geo_value %in% default_geos) %>%
   slice_max(time_value, by=geo_value) %>%
   select(geo_value, time_value, value = hhs)
+# Take data diffs
+diffed_data <- hhs_forecast %>%
+  group_by(geo_value) %>%
+  mutate(value_diff = hhs - lag(hhs)) %>%
+  ungroup() %>%
+  select(geo_value, time_value, value_diff)
+# Forecast the diffs and appean the starting values from the original data
 diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop(x, "value_diff", ahead = ahead, pop_scaling = FALSE), epi_data = diffed_data) %>%
   select(geo_value, time_value = target_end_date, value, quantile) %>%
   bind_rows(tidyr::expand_grid(quantile = diffed_forecast$quantile %>% unique, latest_values))
-
+# Sum the diffs to get the final forecast
 forecast <- diffed_forecast %>%
   group_by(geo_value, quantile) %>%
   arrange(time_value) %>%
@@ -724,7 +725,6 @@ forecast <- diffed_forecast %>%
   ungroup() %>%
   arrange(geo_value, quantile, time_value) %>%
   select(geo_value, forecast_date, target_end_date = time_value, quantile, value)
-
 forecast %>% plot_dec_forecasts(default_geos)
 ```
 

From 54850b7b3ae1dbaa1f081036320ef7a360789083 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Mon, 14 Apr 2025 17:09:57 -0500
Subject: [PATCH 18/62] growth_rate filtering

---
 .github/workflows/tests.yaml               |   2 +-
 scripts/reports/decreasing_forecasters.Rmd | 110 ++++++++++++++++++---
 2 files changed, 100 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 562f3e37..71acda5f 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -38,7 +38,7 @@ jobs:
 
       - name: System dependencies
         run: |
-          sudo apt-get update && sudo apt-get -y install libglpk-dev libcurl4-openssl-dev libfontconfig1-dev
+          sudo apt-get update && sudo apt-get -y install libglpk-dev libcurl4-openssl-dev libfontconfig1-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev
 
       - uses: r-lib/actions/setup-renv@v2
 
diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index e779c021..9b4c4f70 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -48,6 +48,8 @@ Some possible fixes include:
 2. Improve our ability to use non-linear classifiers.
    These would likely be able to handle forecasting both increasing and decreasing signals, but would require a better method of generating quantiles.
    For example, by comparing a given forecast to the climate model, or a comparison with the trajectories that occur on comparable forecast dates (this brings us back to 1).
+3. Making a growth rate extrapolation model, and incorporating its predictions as a covariate.
+   And maybe something better than just simple last-growth-rate-carried-forward.  Ideally we'd have the growth rate taper based on growing population immunity.
 
 # Setup
 
@@ -340,16 +342,20 @@ hhs_forecast <- hhs_archive %>%
 ```
 
 ```{r}
-linear_get_preproc <- function(ahead, lag = c(0, 7, 14), in_data = hhs_forecast) {
-  preproc <- epi_recipe(in_data) %>%
-    step_population_scaling(
-      hhs,
-      df = epidatasets::state_census,
-      df_pop_col = "pop",
-      create_new = FALSE,
-      rate_rescaling = 1e5,
-      by = c("geo_value" = "abbr")
-    ) %>%
+linear_get_preproc <- function(ahead, lag = c(0, 7, 14), in_data = hhs_forecast, to_scale = FALSE) {
+  preproc <- epi_recipe(in_data)
+  if (to_scale) {
+    preproc <- preproc %>%
+      step_population_scaling(
+        hhs,
+        df = epidatasets::state_census,
+        df_pop_col = "pop",
+        create_new = FALSE,
+        rate_rescaling = 1e5,
+        by = c("geo_value" = "abbr")
+      )
+  }
+  preproc <- preproc %>%
     step_adjust_latency(hhs, method = "extend_lags") %>%
     step_epi_lag(hhs, lag = lag) %>%
     step_epi_ahead(hhs, ahead = ahead) %>%
@@ -732,8 +738,90 @@ This is much better than the original forecasts (we're forecasting on the same
 data as in the "problem setup" section).
 
 ## Does filtering by growth rate change it?
-
 Roughly, the idea is to filter the data to that within a range of the growth rate of the most recent data point.
+To actually execute on this, we'll use a `step_filter` at the end with a condition based on the growth rate at the first lag.
+This should only apply during training; during testing, we just want the latest data, regardless of the growth rate.
+First we'll define the function that will filter based on the growth rate.
+### Growth rate geo based quantile
+
+```{r}
+filter_hhs <- function(geo_value, time_value, lagged, quantile_lower = 0.25, quantile_upper = 0.75, ...) {
+  growth_rates <- tibble(geo_value, time_value, lagged) %>%
+    group_by(geo_value) %>%
+    mutate(gr = growth_rate(lagged, ...))
+  gr_res <- growth_rates %>% pull(gr)
+  quantile_bounds <- gr_res %>% quantile(c(quantile_lower, quantile_upper), na.rm = TRUE)
+  (quantile_bounds[[1]] < gr_res) & (gr_res < quantile_bounds[[2]])
+}
+```
+
+This computes the growth rate for `lagged` with whatever extra parameters are handed in, and returns a boolean vector saying if it's within the quantile range (say 25% to 75%)[^1].
+
+Since we want this to be after everything else we compute, we can just add it after all the other steps in the recipe.
+
 ```{r}
+get_filtered_workflow <- function(ahead, in_data = hhs_forecast, trainer = quantile_reg(quantile_levels = covidhub_probs()), ...) {
+  preproc <- linear_get_preproc(ahead, in_data = in_data) %>%
+  step_filter(filter_hhs(geo_value, time_value, lag_7_hhs, ...), skip = TRUE)
+  postproc <- frosting() %>%
+    layer_predict() %>%
+    layer_quantile_distn(quantile_levels = covidhub_probs()) %>%
+    layer_point_from_distn() %>%
+    layer_threshold() %>%
+    layer_naomit() %>%
+    layer_add_target_date() %>%
+    layer_add_forecast_date()
+  workflow <- epi_workflow(preproc, trainer) %>%
+    fit(hhs_forecast) %>%
+    add_frosting(postproc)
+  workflow
+}
 
+hhs_forecast <- hhs_archive %>%
+  epix_as_of(forecast_date) %>%
+  filter(time_value > train_value_min)
+```
+
+And fitting:
+
+```{r}
+fit_given_growth_rate_params <- function(...) {
+  fit_workflows <- map(0:4 * 7, \(ahead) get_filtered_workflow(ahead, ...))
+  predictions <- map(
+    fit_workflows,
+    \(wf) predict(wf, hhs_forecast %>% filter(time_value >as.Date("2023-11-29")- 7*28)) %>% pivot_quantiles_longer(.pred_distn) %>% rename(value = .pred_distn_value, quantile = .pred_distn_quantile_level) %>% filter(time_value == forecast_date)
+  )
+  predictions %>% bind_rows() %>% rename(target_end_date = target_date) %>% select(-.pred, -time_value) %>% arrange(geo_value, target_end_date, quantile)
+  plot_dec_forecasts(predictions %>% bind_rows() %>% rename(target_end_date = target_date), default_geos)
+}
+fit_given_growth_rate_params(method = "smooth_spline")
+```
+
+using the `"smooth_spline"` method didn't seem to help.
+let's try `"rel_change"`, with a window size of 5 weeks:
+
+```{r}
+fit_given_growth_rate_params(h = 7 * 5)
+```
+
+if anything that is worse.
+And linear regression, also using 5 weeks:
+
+```{r}
+fit_given_growth_rate_params(method = "linear_reg", h = 7 * 5)
+```
+
+So this doesn't seem to be helping.
+Let's take a closer look at the growth rates that we're using to do this filtering.
+
+```{r}
+growth_rates <- hhs_forecast %>%
+  group_by(geo_value) %>%
+  mutate(gr = growth_rate(hhs, method = "smooth_spline"))
+quantile_bounds <-
+  growth_rates %>%
+  pull(gr) %>%
+  quantile(c(0.25, 0.75), na.rm = TRUE)
+growth_rates %>% ggplot(aes( x = gr)) + geom_histogram(bins = 300) + geom_vline(aes(xintercept = quantile_bounds[[1]])) + scale_x_log10()
+(quantile_bounds[[1]] < gr_res) & (gr_res < quantile_bounds[[2]])
 ```

From 24d03b1e49d5e22a7287b2c71c29083a1883783e Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Mon, 14 Apr 2025 16:29:55 -0700
Subject: [PATCH 19/62] enh: try seasonal windowing on diffs forecaster

---
 scripts/reports/decreasing_forecasters.Rmd | 70 ++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index 9b4c4f70..047df8ef 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -737,6 +737,76 @@ forecast %>% plot_dec_forecasts(default_geos)
 This is much better than the original forecasts (we're forecasting on the same
 data as in the "problem setup" section).
 
+Let's combine this with seasonal training data.
+
+```{r}
+hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
+# Get the latest values so we know where to sum from later
+latest_values <- hhs_forecast %>%
+  filter(geo_value %in% default_geos) %>%
+  slice_max(time_value, by=geo_value) %>%
+  select(geo_value, time_value, value = hhs)
+# Take data diffs
+diffed_data <- hhs_forecast %>%
+  group_by(geo_value) %>%
+  mutate(value_diff = hhs - lag(hhs)) %>%
+  ungroup() %>%
+  select(geo_value, time_value, value_diff)
+# Forecast the diffs and appean the starting values from the original data
+diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop_seasonal(x, "value_diff", ahead = ahead, pop_scaling = FALSE, seasonal_method = "window"), epi_data = diffed_data) %>%
+  select(geo_value, time_value = target_end_date, value, quantile) %>%
+  bind_rows(tidyr::expand_grid(quantile = diffed_forecast$quantile %>% unique, latest_values))
+# Sum the diffs to get the final forecast
+forecast <- diffed_forecast %>%
+  group_by(geo_value, quantile) %>%
+  arrange(time_value) %>%
+  mutate(value = cumsum(value), forecast_date = .env$forecast_date) %>%
+  ungroup() %>%
+  arrange(geo_value, quantile, time_value) %>%
+  select(geo_value, forecast_date, target_end_date = time_value, quantile, value)
+forecast %>% plot_dec_forecasts(default_geos)
+```
+
+Turns out, we get better coverage this way.
+
+Now let's try training on the augmented data (fluview and ILI). TODO.
+
+```{r, eval = FALSE}
+joined_archive_data_as_of <- tar_read(joined_archive_data) %>% epix_as_of(forecast_date)
+
+# Get the latest values so we know where to sum from later
+latest_values <- joined_archive_data_as_of %>%
+  filter(geo_value %in% default_geos, source == "nhsn") %>%
+  slice_max(time_value, by=geo_value) %>%
+  select(geo_value, time_value, value = hhs)
+# Take data diffs
+diffed_data <- joined_archive_data_as_of %>%
+  filter(geo_value %in% default_geos) %>%
+  filter(time_value <= forecast_date - 7) %>%
+  arrange(source, geo_value, time_value) %>%
+  group_by(geo_value, source) %>%
+  mutate(value_diff = hhs - lag(hhs)) %>%
+  filter(!near(0, value_diff)) %>%
+  ungroup() %>%
+  select(source, geo_value, time_value, value_diff) %>%
+  drop_na(value_diff) %>%
+  as_epi_df(other_keys = "source", as_of = forecast_date)
+
+# Forecast the diffs and appean the starting values from the original data
+diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop_seasonal(x, "value_diff", ahead = ahead, pop_scaling = FALSE, seasonal_method = "window"), epi_data = diffed_data) %>%
+  select(geo_value, time_value = target_end_date, value, quantile) %>%
+  bind_rows(tidyr::expand_grid(quantile = diffed_forecast$quantile %>% unique, latest_values))
+# Sum the diffs to get the final forecast
+forecast <- diffed_forecast %>%
+  group_by(geo_value, quantile) %>%
+  arrange(time_value) %>%
+  mutate(value = cumsum(value), forecast_date = .env$forecast_date) %>%
+  ungroup() %>%
+  arrange(geo_value, quantile, time_value) %>%
+  select(geo_value, forecast_date, target_end_date = time_value, quantile, value)
+forecast %>% plot_dec_forecasts(default_geos)
+```
+
 ## Does filtering by growth rate change it?
 Roughly, the idea is to filter the data to that within a range of the growth rate of the most recent data point.
 To actually execute on this, we'll use a `step_filter` at the end with a condition based on the growth rate at the first lag.

From 7307734cdec28d34cc08fbaee786ec03aa08904d Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Mon, 14 Apr 2025 17:30:05 -0700
Subject: [PATCH 20/62] enh: do diffs forecast on flusion data

---
 R/forecasters/forecaster_scaled_pop_seasonal.R | 9 ++++++---
 scripts/reports/decreasing_forecasters.Rmd     | 8 ++++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/R/forecasters/forecaster_scaled_pop_seasonal.R b/R/forecasters/forecaster_scaled_pop_seasonal.R
index 2fe758ab..4d6539e1 100644
--- a/R/forecasters/forecaster_scaled_pop_seasonal.R
+++ b/R/forecasters/forecaster_scaled_pop_seasonal.R
@@ -52,6 +52,7 @@ scaled_pop_seasonal <- function(epi_data,
                                 quantile_levels = covidhub_probs(),
                                 filter_source = "",
                                 filter_agg_level = "",
+                                clip_lower = TRUE,
                                 ...) {
   scale_method <- arg_match(scale_method)
   center_method <- arg_match(center_method)
@@ -92,7 +93,7 @@ scaled_pop_seasonal <- function(epi_data,
   }
   args_input[["ahead"]] <- ahead
   args_input[["quantile_levels"]] <- quantile_levels
-  args_input[["nonneg"]] <- scale_method == "none"
+  args_input[["nonneg"]] <- if (!is.null(args_input[["nonneg"]])) args_input[["nonneg"]] else scale_method == "none"
   args_input[["seasonal_window"]] <- "window" %in% seasonal_method
   args_input[["seasonal_backward_window"]] <- seasonal_backward_window
   args_input[["seasonal_forward_window"]] <- seasonal_forward_window + ahead
@@ -228,8 +229,10 @@ scaled_pop_seasonal <- function(epi_data,
   pred_final <- pred %>%
     rename({{ outcome }} := value) %>%
     data_coloring(outcome, learned_params, join_cols = key_colnames(epi_data, exclude = "time_value"), nonlin_method = nonlin_method) %>%
-    rename(value = {{ outcome }}) %>%
-    mutate(value = pmax(0, value))
+    rename(value = {{ outcome }})
+  if (clip_lower) {
+    pred_final %<>% mutate(value = pmax(0, value))
+  }
   if (adding_source) {
     pred_final %<>% select(-source)
   }
diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index 047df8ef..2acbbce8 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -769,7 +769,7 @@ forecast %>% plot_dec_forecasts(default_geos)
 
 Turns out, we get better coverage this way.
 
-Now let's try training on the augmented data (fluview and ILI). TODO.
+Now let's try training on the augmented data (fluview and ILI).
 
 ```{r, eval = FALSE}
 joined_archive_data_as_of <- tar_read(joined_archive_data) %>% epix_as_of(forecast_date)
@@ -793,7 +793,7 @@ diffed_data <- joined_archive_data_as_of %>%
   as_epi_df(other_keys = "source", as_of = forecast_date)
 
 # Forecast the diffs and appean the starting values from the original data
-diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop_seasonal(x, "value_diff", ahead = ahead, pop_scaling = FALSE, seasonal_method = "window"), epi_data = diffed_data) %>%
+diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop_seasonal(x, "value_diff", ahead = ahead, pop_scaling = FALSE,  scale_method = "none", center_method = "none",nonlin_method = "none", clip_lower = FALSE, nonneg = FALSE, seasonal_method = "window"), epi_data = diffed_data) %>%
   select(geo_value, time_value = target_end_date, value, quantile) %>%
   bind_rows(tidyr::expand_grid(quantile = diffed_forecast$quantile %>% unique, latest_values))
 # Sum the diffs to get the final forecast
@@ -807,7 +807,11 @@ forecast <- diffed_forecast %>%
 forecast %>% plot_dec_forecasts(default_geos)
 ```
 
+It doesn't look great. The median is weirdly flat. Likely need to do something
+about whitening and then undo the whitening.
+
 ## Does filtering by growth rate change it?
+
 Roughly, the idea is to filter the data to that within a range of the growth rate of the most recent data point.
 To actually execute on this, we'll use a `step_filter` at the end with a condition based on the growth rate at the first lag.
 This should only apply during training; during testing, we just want the latest data, regardless of the growth rate.

From 7688a2032c7c1a307bdbed22d13635a654c745c4 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Tue, 15 Apr 2025 16:24:35 -0500
Subject: [PATCH 21/62] minor fixes

---
 scripts/reports/decreasing_forecasters.Rmd | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index 2acbbce8..927888e8 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -720,11 +720,13 @@ diffed_data <- hhs_forecast %>%
   ungroup() %>%
   select(geo_value, time_value, value_diff)
 # Forecast the diffs and appean the starting values from the original data
-diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop(x, "value_diff", ahead = ahead, pop_scaling = FALSE), epi_data = diffed_data) %>%
+diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop(x, "value_diff", ahead = ahead, pop_scaling = FALSE), epi_data = diffed_data)
+diffed_forecast %<>%
   select(geo_value, time_value = target_end_date, value, quantile) %>%
-  bind_rows(tidyr::expand_grid(quantile = diffed_forecast$quantile %>% unique, latest_values))
+  bind_rows(tidyr::expand_grid(quantile = covidhub_probs(), latest_values))
 # Sum the diffs to get the final forecast
-forecast <- diffed_forecast %>%
+forecast <-
+  diffed_forecast %>%
   group_by(geo_value, quantile) %>%
   arrange(time_value) %>%
   mutate(value = cumsum(value), forecast_date = .env$forecast_date) %>%
@@ -772,8 +774,7 @@ Turns out, we get better coverage this way.
 Now let's try training on the augmented data (fluview and ILI).
 
 ```{r, eval = FALSE}
-joined_archive_data_as_of <- tar_read(joined_archive_data) %>% epix_as_of(forecast_date)
-
+joined_archive_data_as_of <- tar_read(joined_archive_data) %>% epix_as_of(forecast_date) %>% filter(agg_level == "state")
 # Get the latest values so we know where to sum from later
 latest_values <- joined_archive_data_as_of %>%
   filter(geo_value %in% default_geos, source == "nhsn") %>%
@@ -781,19 +782,16 @@ latest_values <- joined_archive_data_as_of %>%
   select(geo_value, time_value, value = hhs)
 # Take data diffs
 diffed_data <- joined_archive_data_as_of %>%
-  filter(geo_value %in% default_geos) %>%
-  filter(time_value <= forecast_date - 7) %>%
   arrange(source, geo_value, time_value) %>%
   group_by(geo_value, source) %>%
   mutate(value_diff = hhs - lag(hhs)) %>%
-  filter(!near(0, value_diff)) %>%
   ungroup() %>%
   select(source, geo_value, time_value, value_diff) %>%
   drop_na(value_diff) %>%
   as_epi_df(other_keys = "source", as_of = forecast_date)
 
 # Forecast the diffs and appean the starting values from the original data
-diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop_seasonal(x, "value_diff", ahead = ahead, pop_scaling = FALSE,  scale_method = "none", center_method = "none",nonlin_method = "none", clip_lower = FALSE, nonneg = FALSE, seasonal_method = "window"), epi_data = diffed_data) %>%
+diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop_seasonal(x, "value_diff", ahead = ahead, pop_scaling = FALSE, nonlin_method = "none", filter_agg_level = "state", clip_lower = FALSE, nonneg = FALSE, seasonal_method = "window"), epi_data = diffed_data) %>%
   select(geo_value, time_value = target_end_date, value, quantile) %>%
   bind_rows(tidyr::expand_grid(quantile = diffed_forecast$quantile %>% unique, latest_values))
 # Sum the diffs to get the final forecast

From 5c40e731d161ae82358859652ab36ca274f9ef67 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Wed, 16 Apr 2025 15:21:53 -0700
Subject: [PATCH 22/62] fix+enh: minor fixes and add 0 intercept to slope
 calculation

---
 scripts/reports/decreasing_forecasters.Rmd | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index 927888e8..8b7c4349 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -284,6 +284,7 @@ plot_dec_forecasts(all_one_lag_forecasts, default_geos)
 Still constantly falling, unfortunately, and a surprisingly similar forecast.
 
 ## Removing 2020/21 and 2021/22 improves but doesn't fix the situation
+
 The first 2 years are bad for flu, as there was ~no spread.
 In practice we know this behavior happens when we remove those years, and even the summers.
 I'm including this primarily to compare the change in growth rates.
@@ -306,7 +307,9 @@ basic_gr %>% ggplot(aes(x = growth)) +
 ```
 
 which are more or less the same level of bad.
+
 ### Linear only
+
 However if we filter to 2022 onwards and use a linear engine:
 
 ```{r}
@@ -334,7 +337,6 @@ First lets examine the coefficients that are actually fit; to do that from withi
 For the sake of reproducibility, we will make the steps by hand.
 Note that I've tried this section with both filtering pre 2022 values and not, and the results are approximately the same.
 
-filtering the data
 ```{r}
 hhs_forecast <- hhs_archive %>%
   epix_as_of(forecast_date) %>%
@@ -432,7 +434,7 @@ plot_linear_data <- function(fit_data_long) {
   fit_data_long %>%
     ggplot(aes(x = lag_7_hhs, y = value)) +
     geom_point(aes(color = epi_week), alpha = 1, size = 0.7) +
-    geom_smooth(method = "lm", formula = y ~ x) +
+    geom_smooth(method = "lm", formula = y ~ x+0) +
     geom_abline(intercept = 0, slope = 1) +
     facet_wrap(~ahead_value, scales = "free") +
     scale_color_viridis_c() +
@@ -895,5 +897,5 @@ quantile_bounds <-
   pull(gr) %>%
   quantile(c(0.25, 0.75), na.rm = TRUE)
 growth_rates %>% ggplot(aes( x = gr)) + geom_histogram(bins = 300) + geom_vline(aes(xintercept = quantile_bounds[[1]])) + scale_x_log10()
-(quantile_bounds[[1]] < gr_res) & (gr_res < quantile_bounds[[2]])
+# (quantile_bounds[[1]] < gr_res) & (gr_res < quantile_bounds[[2]])
 ```

From 572939e3e752c04b93eb1ac4a5fcc831a859f0bd Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Wed, 16 Apr 2025 15:24:24 -0700
Subject: [PATCH 23/62] wip: start season summary

---
 reports/template.md                     | 14 ++------------
 scripts/reports/season_summary_2025.Rmd | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 12 deletions(-)
 create mode 100644 scripts/reports/season_summary_2025.Rmd

diff --git a/reports/template.md b/reports/template.md
index 738be7df..7b05fd52 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -8,8 +8,9 @@
 ### Scoring this season
 
 
-## Exploration Reports
+## Summary Reports
 
+- [Season Summary](season_summary.html)
 - [An Analysis of Decreasing Behavior in Forecasters](decreasing_forecasters.html)
 - [NHSN 2024-2025 Data Analysis](new_data.html)
 
@@ -105,10 +106,6 @@ TODO: Add descriptions.
 
 This is more closely in line with the [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler) from scikit-learn (using a much wider quantile than the default settings there).
 
-## Overall comparison
-
-This takes the best mean WIS result from each of the forecaster families below, and puts them in the same notebook for inter-family comparison.
-
 ## Forecaster Families
 
 ### AR with population scaling
@@ -152,10 +149,3 @@ $f$ is either the identity or 2 sine terms, defined so that the first has half a
 ### Flatline
 
 This is what the FluSight-baseline is based on, so they should be identical. However, at the moment, this has scaling issues.
-
-# Covid Forecasts 2024-2025
-
-For now, just AR forecasters with source-pooled data. Forecaster descriptions
-are the same as above.
-
-TODO: Get lagged correlations notebook hosted.
diff --git a/scripts/reports/season_summary_2025.Rmd b/scripts/reports/season_summary_2025.Rmd
new file mode 100644
index 00000000..ac8390e8
--- /dev/null
+++ b/scripts/reports/season_summary_2025.Rmd
@@ -0,0 +1,24 @@
+---
+title: "Season Summary 2024-2025"
+date: "compiled on `r format(Sys.time(), '%d %B %Y')`"
+output:
+  html_document:
+    code_folding: hide
+editor_options:
+  chunk_output_type: console
+---
+
+$$\\[.4in]$$
+
+```{r echo=FALSE}
+knitr::opts_chunk$set(
+  fig.align = "center",
+  message = FALSE,
+  warning = FALSE,
+  cache = FALSE
+)
+ggplot2::theme_set(ggplot2::theme_bw())
+source(here::here("R/load_all.R"))
+```
+
+Draw the rest of the owl.
\ No newline at end of file

From e542fc87156168c2e1c0f2448bcf8d1d42b15528 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Wed, 16 Apr 2025 17:28:27 -0500
Subject: [PATCH 24/62] Getting backtest_mode working

---
 R/forecasters/formatters.R | 35 +++++++++++++++++++----------------
 R/targets/score_targets.R  | 10 ++++++----
 scripts/covid_hosp_prod.R  |  2 +-
 scripts/flu_hosp_prod.R    |  2 +-
 4 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/R/forecasters/formatters.R b/R/forecasters/formatters.R
index ce42c0bb..d72c376c 100644
--- a/R/forecasters/formatters.R
+++ b/R/forecasters/formatters.R
@@ -72,24 +72,27 @@ format_flusight <- function(pred, disease = c("flu", "covid")) {
 }
 
 format_scoring_utils <- function(forecasts_and_ensembles, disease = c("flu", "covid")) {
-  forecasts_and_ensembles %>%
-    filter(!grepl("region.*", geo_value)) %>%
-    mutate(
-      reference_date = get_forecast_reference_date(forecast_date),
-      target = glue::glue("wk inc {disease} hosp"),
-      horizon = as.integer(floor((target_end_date - reference_date) / 7)),
-      output_type = "quantile",
-      output_type_id = quantile,
-      value = value
-    ) %>%
-    left_join(
-      get_population_data() %>%
-        select(state_id, state_code),
-      by = c("geo_value" = "state_id")
-    ) %>%
+  # dplyr here was unreasonably slow on 1m+ rows, so replacing with direct access
+  fc_ens <- forecasts_and_ensembles
+  fc_ens <- fc_ens[!grepl("region.*", forecasts_and_ensembles$geo_value), ]
+  fc_ens[, "reference_date"] <- get_forecast_reference_date(fc_ens$forecast_date)
+  fc_ens[, "target"] <- glue::glue("wk inc {disease} hosp")
+  fc_ens[, "horizon"] <- as.integer(floor((fc_ens$target_end_date - fc_ens$reference_date) / 7))
+  fc_ens[, "output_type"] <- "quantile"
+  fc_ens[, "output_type_id"] <- fc_ens$quantile
+  left_join(
+    fc_ens,
+    get_population_data() %>%
+      select(state_id, state_code),
+    by = c("geo_value" = "state_id")
+  ) %>%
     rename(location = state_code, model_id = forecaster) %>%
     select(reference_date, target, horizon, target_end_date, location, output_type, output_type_id, value, model_id) %>%
-    drop_na()
+    drop_na() %>%
+    arrange(location, target_end_date, reference_date, output_type_id) %>%
+    group_by(location, target_end_date, reference_date) %>%
+    mutate(value = sort(value)) %>%
+    ungroup()
 }
 
 #' The quantile levels used by the covidhub repository
diff --git a/R/targets/score_targets.R b/R/targets/score_targets.R
index afa47c95..dfceb9b5 100644
--- a/R/targets/score_targets.R
+++ b/R/targets/score_targets.R
@@ -13,7 +13,7 @@ get_external_forecasts <- function(external_object_name) {
     select(forecaster, geo_value, forecast_date, target_end_date, quantile, value)
 }
 
-score_forecasts <- function(nhsn_latest_data, joined_forecasts_and_ensembles) {
+score_forecasts <- function(nhsn_latest_data, joined_forecasts_and_ensembles, disease) {
   truth_data <-
     nhsn_latest_data %>%
     select(geo_value, target_end_date = time_value, oracle_value = value) %>%
@@ -33,10 +33,12 @@ score_forecasts <- function(nhsn_latest_data, joined_forecasts_and_ensembles) {
     pull(max_forecast) %>%
     min()
   forecasts_formatted <-
-    joined_forecasts_and_ensembles %>%
-    filter(forecast_date <= max_forecast_date) %>%
-    format_scoring_utils(disease = "covid")
+    joined_forecasts_and_ensembles[joined_forecasts_and_ensembles$forecast_date <= max_forecast_date,] %>%
+    format_scoring_utils(disease = disease)
   scores <- forecasts_formatted %>%
+    arrange(location, target_end_date, reference_date, output_type_id) %>%
+    group_by(location, target_end_date, reference_date) %>%
+    mutate(value = sort(value)) %>%
     filter(location %nin% c("US", "60", "66", "78")) %>%
     hubEvals::score_model_out(
       truth_data,
diff --git a/scripts/covid_hosp_prod.R b/scripts/covid_hosp_prod.R
index a8942b7a..5960e20a 100644
--- a/scripts/covid_hosp_prod.R
+++ b/scripts/covid_hosp_prod.R
@@ -435,7 +435,7 @@ if (g_backtest_mode) {
     tar_target(
       name = scores,
       command = {
-        score_forecasts(nhsn_latest_data, joined_forecasts_and_ensembles)
+        score_forecasts(nhsn_latest_data, joined_forecasts_and_ensembles, "covid")
       }
     ),
     tar_target(
diff --git a/scripts/flu_hosp_prod.R b/scripts/flu_hosp_prod.R
index 75ad7cbb..3323231b 100644
--- a/scripts/flu_hosp_prod.R
+++ b/scripts/flu_hosp_prod.R
@@ -481,7 +481,7 @@ if (g_backtest_mode) {
           mutate(
             time_value = ceiling_date(time_value, unit = "week", week_start = 6)
           )
-        score_forecasts(nhsn_latest_end_of_week, joined_forecasts_and_ensembles)
+        score_forecasts(nhsn_latest_end_of_week, joined_forecasts_and_ensembles, "flu")
       }
     ),
     tar_target(

From 0f4a119294d1e8bedc045f56d9a753711a578f79 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Wed, 16 Apr 2025 18:13:08 -0500
Subject: [PATCH 25/62] revision summary notebook

---
 .../reports/revision_summary_report_2025.Rmd  | 339 ++++++++++++++++++
 1 file changed, 339 insertions(+)
 create mode 100644 scripts/reports/revision_summary_report_2025.Rmd

diff --git a/scripts/reports/revision_summary_report_2025.Rmd b/scripts/reports/revision_summary_report_2025.Rmd
new file mode 100644
index 00000000..f55270ca
--- /dev/null
+++ b/scripts/reports/revision_summary_report_2025.Rmd
@@ -0,0 +1,339 @@
+---
+title: "Revision summary 2025"
+author: Delphi Forecast Team
+date: "Rendered: `r format(Sys.time(), '%Y-%m-%d %H:%M:%S')`"
+output:
+  html_document:
+    code_folding: hide
+    toc: True
+    # self_contained: False
+    # lib_dir: libs
+params:
+  disease: "covid"
+  scores: ""
+  forecast_dates: ""
+---
+
+```{css, echo=FALSE}
+body {
+  display: block;
+  max-width: 1280px !important;
+  margin-left: auto;
+  margin-right: auto;
+}
+
+body .main-container {
+  max-width: 1280px !important;
+  width: 1280px !important;
+}
+```
+
+```{r echo=FALSE}
+knitr::opts_chunk$set(
+  fig.align = "center",
+  message = FALSE,
+  warning = FALSE,
+  cache = FALSE
+)
+ggplot2::theme_set(ggplot2::theme_bw())
+```
+
+```{r setup, include=FALSE}
+suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
+```
+
+# Overall takeaways
+
+There is substantial underreporting behavior that is fairly consistent for a single geo.
+It is likely usable for forecasting.
+Further, flu and covid revision behavior is fairly strongly correlated; it is reported through the same channels by the same people, so this makes sense.
+We should look into the extra columns to see if it provides useful information for handling revision behavior.
+# Flu revision
+## NHSN
+### Revision behavior
+First we get the archive and remove the data older than the first version so as not to clog up the revision behavior, and display the overall revision summary[^1].
+
+```{r}
+nhsn_archive_flu <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "nhsn_archive_data"))
+
+nhsn_archive_flu <- nhsn_archive_flu$DT %>% filter(time_value >= "2024-11-19") %>% as_epi_archive()
+nhsn_archive_flu$time_type <- "day"
+revision_summary <- nhsn_archive_flu %>% epiprocess::revision_analysis(value, min_waiting_period = NULL,)
+revision_summary %>% print(quick_revision = 7)
+```
+
+So around a fifth have no revisions, around a quarter resolve within a week, and around 2/3rds have a small number of revisions.
+Around half of those with revisions have little relative change (10% of the max value).
+The "actual value" change isn't really worth thinking about because this is counts data (so there being only 6 doesn't tell us much).
+
+Here's a plot of the version changes for all locations.
+
+```{r}
+nhsn_archive_flu %>% autoplot() + theme(strip.text.x = element_text(size = 2))
+```
+
+Since this is probably too small to actually be legible, let's figure out the states with the worst revision behavior and plot those.
+Worst in this case meaning worst average relative spread
+
+```{r}
+av_re_spread <- revision_summary$revision_behavior %>%
+  group_by(geo_value) %>%
+  summarize(rel_spread = mean(rel_spread, na.rm = TRUE)) %>%
+  arrange(desc(rel_spread)) %>%
+  filter(geo_value %nin% c("vi", "as", "gu"))
+av_re_spread %>%
+  print(n=20)
+```
+
+The worst 9 excluding the geo's we don't actually forecast (so no `as` or `vi`, `gu`).
+
+```{r}
+nhsn_archive_flu %>% autoplot(.facet_filter = geo_value %in% av_re_spread$geo_value[1:9]) + theme(strip.text.x = element_text(size = 8))
+```
+
+And the next 9 worse
+
+```{r}
+nhsn_archive_flu %>% autoplot(.facet_filter = geo_value %in% av_re_spread$geo_value[10:18]) + theme(strip.text.x = element_text(size = 8))
+```
+
+
+These cover a range of revision behaviors; some are off on a single `time_value` in a terrible way, such as `mn` or `nh`, while most have fairly constant revisioning.
+`az` is unusual in having two bad `version`s when they revised the entire history down by a factor of 2, and then reverted to what it had been previously.
+Most are systematically reporting lower than the actual values, with some such as `nm`, `nh`, `ok`, and `ak` particularly bad about this.
+It is probably worth adding a measure at the key level of how systematic the bias is to `revision_analysis`, or perhaps a separate function.
+These seem likely to be estimable beforehand.
+Exceptions that include at least one case of over-reporting: `mn`, `id`, `nh`
+These are not later backfilled; they seem more like bad estimates or entry error.
+### Data substitions
+And we actually need to compare this with the data revision estimates:
+```{r}
+data_substitutions <- readr::read_csv(
+    here::here(glue::glue("flu_data_substitutions.csv")),
+    comment = "#",
+    show_col_types = FALSE
+  ) %>%
+  mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
+nhsn_archive_flu %>%
+  autoplot(value, .facet_filter = geo_value %in% (data_substitutions$geo_value %>% unique())) + geom_point(data = data_substitutions, aes(x = time_value, y = value)) + facet_wrap(~geo_value, scale = "free")
+```
+
+which doesn't look all that great. 
+To calculate how much closer (or further) we were from the final value, first we construct the relevant snapshots:
+```{r}
+final_values <- nhsn_archive_flu %>% epix_as_of_current() %>% mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
+data_as_it_was <- map(
+  data_substitutions$forecast_date %>% unique(),
+  \(version) nhsn_archive_flu %>% epix_as_of(version) %>% mutate(forecast_date = version)
+) %>%
+  list_rbind() %>% mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
+```
+
+realigning all of
+
+```{r}
+final_values %>% filter(time_value > "2025-01-03")
+full_table <- data_substitutions %>%
+  left_join(
+    final_values,
+    by = join_by(geo_value, time_value), suffix = c("_substituted", "_final")
+  ) %>%
+  left_join(
+    data_as_it_was,
+    by = join_by(geo_value, forecast_date, time_value)
+  ) %>%
+  rename(as_of_value = value) %>%
+  mutate()
+  
+diffs <- full_table %>%
+  mutate(
+    abs_diff = value_substituted - value_final,
+    rel_diff = abs_diff / value_final,
+    rel_rev_diff = abs_diff / (-value_final + as_of_value),
+    ) %>%
+  arrange(rel_diff)
+diffs %>%
+  select(-forecast_date, -value_substituted, -value_final, -as_of_value) %>%
+  print(n = 23)
+```
+
+The table is sorted by `rel_diff`.
+- `abs_diff` gives how much our adjustment was over by (so positive means we were over the latest value).
+
+- `rel_diff` gives the difference relative to the latest value; for `nh` on `03/29` (the last entry in the table), our estimate was nearly twice as large as the latest value.
+
+- `rel_rev_diff` on the other hand divides that by how much the value as of the forecast date was off; for `nh` on `03/29` again, it was merely 73%, so we did bring it closer to the actual value. Any of these which are <1 are "successful" in the sense that we were closer to the latest value than the as_of value was. An infinite value tells us that we adjusted a value that hasn't been corrected. The sign for `rel_rev_diff` is a bit confusing, and tells us whether our estimate and the as of value were both larger/smaller than the latest value, or one larger and one smaller.
+
+How many did we substitute a more accurate value?
+
+```{r}
+mean(abs(diffs$rel_rev_diff) < 1)
+```
+
+34%, so not a great track record overall.
+How about lower than the target vs higher than the target?
+```{r}
+diffs %>%
+  mutate(is_over = rel_diff > 0) %>%
+  group_by(is_over) %>%
+  summarize(fraction_improved = mean(abs(rel_rev_diff) < 1))
+```
+
+So we did marginally better when it was below, but much worse when it was above.
+
+Overall, it turns out our value substitutions did not actually help much.
+
+## NSSP
+### Revision behavior
+### Correlation with latest
+Does NSSP actually correlate better with the latest value than nhsn itself does?
+This was a property we were relying on through the season to generate our corrections.
+
+# Covid revision
+And now for ~ the same idea, but for covid
+## NHSN
+First we get the archive and remove the data older than the first version so as not to clog up the revision behavior, and display the overall revision summary[^1].
+
+```{r}
+nhsn_archive_covid <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "nhsn_archive_data"))
+
+nhsn_archive_covid <- nhsn_archive_covid$DT %>% filter(time_value >= "2024-11-19") %>%
+  filter(geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
+nhsn_archive_covid$time_type <- "day"
+revision_summary <- nhsn_archive_covid %>%
+  epiprocess::revision_analysis(value, min_waiting_period = NULL) 
+revision_summary %>% print(quick_revision = 7)
+```
+
+Around a fifth have no revisions, around a third resolve within a week, and around 3/4ths have a small number of revisions.
+Around 60% of those with revisions have little relative change (10% of the max value).
+The "actual value" change isn't really worth thinking about because this is counts data (so there being only 6 doesn't tell us much).
+
+Here's a plot of the version changes for all locations.
+
+```{r}
+nhsn_archive_covid %>% autoplot() + theme(strip.text.x = element_text(size = 8))
+```
+
+Since this is probably too small to actually be legible, let's figure out the states with the worst revision behavior and plot those.
+Worst in this case meaning worst average relative spread
+
+```{r}
+av_re_spread <- revision_summary$revision_behavior %>%
+  group_by(geo_value) %>%
+  summarize(rel_spread = mean(rel_spread, na.rm = TRUE)) %>%
+  arrange(desc(rel_spread)) %>%
+  filter(geo_value %nin% c("vi", "as", "gu"))
+av_re_spread %>%
+  print(n=20)
+```
+
+Which, if you compare with the average relative spread in the [Flu section](##NHSN) is remarkably similar.
+The worst 9 excluding the geo's we don't actually forecast (so no `as` or `vi`, `gu`).
+
+```{r}
+nhsn_archive_covid %>% autoplot(.facet_filter = geo_value %in% av_re_spread$geo_value[1:9]) + theme(strip.text.x = element_text(size = 8))
+```
+
+And the next 9 worse
+
+```{r}
+nhsn_archive_covid %>% autoplot(.facet_filter = geo_value %in% av_re_spread$geo_value[10:18]) + theme(strip.text.x = element_text(size = 8))
+```
+
+Strictly visually, this seems to revise more than the flu data (this doesn't actually fit well with the numeric revision analysis so something odd is going on).
+Perhaps the revisions are more chaotic.
+`mn` instead of having a single time point for a single version has it's entire trajectory wrong in the same way that `az` does, possibly on the same `version`.
+`az` has similar revision behavior (a couple of weeks where they were wildly off).
+
+`nh` has similar behavior in over-estimating a specific single time value quite badly, but otherwise having typical under-reporting problems.
+
+`dc` has some new and wild behavior; this is likely as visually striking as it is because the values are so small.
+
+`mo` has new revision behavior, primarily in the magnitude of the difference.
+
+`ok` has more extreme under-reporting than in the case of flu, but again this seems to likely be a factor of the number of cases, suggesting their underreporting happens in absolute number of cases rather than relative[^2].
+
+Like flu, these seem likely to be estimable beforehand.
+### Data substitutions
+And we actually need to compare revision behavior with our estimates of the correct values:
+```{r}
+data_substitutions <- readr::read_csv(
+    here::here(glue::glue("covid_data_substitutions.csv")),
+    comment = "#",
+    show_col_types = FALSE
+  ) %>%
+  mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
+nhsn_archive_covid %>%
+  autoplot(value, .facet_filter = geo_value %in% (data_substitutions$geo_value %>% unique())) + geom_point(data = data_substitutions, aes(x = time_value, y = value)) + facet_wrap(~geo_value, scale = "free")
+```
+
+To calculate how much closer (or further) we were from the final value, first we construct the relevant snapshots:
+```{r}
+final_values <- nhsn_archive_covid %>% epix_as_of_current() %>% mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
+data_as_it_was <- map(
+  data_substitutions$forecast_date %>% unique(),
+  \(version) nhsn_archive_covid %>% epix_as_of(version) %>% mutate(forecast_date = version)
+) %>%
+  list_rbind() %>% mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
+```
+
+
+```{r}
+final_values %>% filter(time_value > "2025-01-03")
+full_table <- data_substitutions %>%
+  left_join(
+    final_values,
+    by = join_by(geo_value, time_value), suffix = c("_substituted", "_final")
+  ) %>%
+  left_join(
+    data_as_it_was,
+    by = join_by(geo_value, forecast_date, time_value)
+  ) %>%
+  rename(as_of_value = value) %>%
+  mutate()
+  
+diffs <- full_table %>%
+  mutate(
+    abs_diff = value_substituted - value_final,
+    rel_diff = abs_diff / value_final,
+    rel_rev_diff = abs_diff / (-value_final + as_of_value),
+    ) %>%
+  arrange(rel_diff)
+diffs %>%
+  select(-forecast_date, -value_substituted, -value_final, -as_of_value) %>%
+  print(n = 23)
+```
+
+The table is sorted by `rel_diff`.
+- `abs_diff` gives how much our adjustment was over by (so positive means we were over the latest value).
+
+- `rel_diff` gives the difference relative to the latest value.
+
+- `rel_rev_diff` on the other hand divides that by how much the value as of the forecast date was off. Any of these which are <1 are "successful" in the sense that we were closer to the latest value than the as_of value was. An infinite value tells us that we adjusted a value that hasn't been corrected. The sign for `rel_rev_diff` is a bit confusing, and tells us whether our estimate and the as of value were both larger/smaller than the latest value, or one larger and one smaller.
+
+We did fewer substitutions in Covid, but they were more regularly good, and there were no cases where we shouldn't have changed the value and did.
+
+How many did we substitute a more accurate value?
+
+```{r}
+mean(abs(diffs$rel_rev_diff) < 1)
+```
+
+50%, so overall basically guessing.
+How about lower than the target vs higher than the target?
+```{r}
+diffs %>%
+  mutate(is_over = rel_diff > 0) %>%
+  group_by(is_over) %>%
+  summarize(fraction_improved = mean(abs(rel_rev_diff) < 1))
+```
+
+When we were under, we always improved the situation. However when we were above, sometimes we made it worse.
+
+
+# Correlations between the two archives
+
+[^1]: `min_waiting_period` is `NULL` here since we're plotting mid-season, while `quick_revision = 7` because this is weekly data represented using days (because the versions are days).
+
+[^2]: which would definitely be a bit odd of an effect.

From ddab61c018575a5d3bc1ec1088d5240b3c4c8b98 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Fri, 18 Apr 2025 16:22:24 -0500
Subject: [PATCH 26/62] Basic revision summary complete

---
 reports/template.md                           |   1 +
 .../reports/revision_summary_report_2025.Rmd  | 114 ++++++++++++++++--
 2 files changed, 103 insertions(+), 12 deletions(-)

diff --git a/reports/template.md b/reports/template.md
index 7b05fd52..fb6f4dbe 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -12,6 +12,7 @@
 
 - [Season Summary](season_summary.html)
 - [An Analysis of Decreasing Behavior in Forecasters](decreasing_forecasters.html)
+- [Revision Behavior](revision_summary_2025.html)
 - [NHSN 2024-2025 Data Analysis](new_data.html)
 
 ### Flu
diff --git a/scripts/reports/revision_summary_report_2025.Rmd b/scripts/reports/revision_summary_report_2025.Rmd
index f55270ca..896ad030 100644
--- a/scripts/reports/revision_summary_report_2025.Rmd
+++ b/scripts/reports/revision_summary_report_2025.Rmd
@@ -31,7 +31,7 @@ body .main-container {
 ```{r echo=FALSE}
 knitr::opts_chunk$set(
   fig.align = "center",
-  message = FALSE,
+  message = TRUE,
   warning = FALSE,
   cache = FALSE
 )
@@ -45,20 +45,27 @@ suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 # Overall takeaways
 
 There is substantial underreporting behavior that is fairly consistent for a single geo.
-It is likely usable for forecasting.
+We can probably improve our forecasts by including revision behavior.
 Further, flu and covid revision behavior is fairly strongly correlated; it is reported through the same channels by the same people, so this makes sense.
 We should look into the extra columns to see if it provides useful information for handling revision behavior.
+
 # Flu revision
+
 ## NHSN
+
 ### Revision behavior
+
 First we get the archive and remove the data older than the first version so as not to clog up the revision behavior, and display the overall revision summary[^1].
 
 ```{r}
 nhsn_archive_flu <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "nhsn_archive_data"))
 
-nhsn_archive_flu <- nhsn_archive_flu$DT %>% filter(time_value >= "2024-11-19") %>% as_epi_archive()
+nhsn_archive_flu <- nhsn_archive_flu$DT %>% filter(time_value >= "2024-11-19", geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
 nhsn_archive_flu$time_type <- "day"
-revision_summary <- nhsn_archive_flu %>% epiprocess::revision_analysis(value, min_waiting_period = NULL,)
+revision_summary <- nhsn_archive_flu %>% epiprocess::revision_analysis(value, min_waiting_period = NULL)
+```
+
+```{r}
 revision_summary %>% print(quick_revision = 7)
 ```
 
@@ -68,8 +75,9 @@ The "actual value" change isn't really worth thinking about because this is coun
 
 Here's a plot of the version changes for all locations.
 
-```{r}
-nhsn_archive_flu %>% autoplot() + theme(strip.text.x = element_text(size = 2))
+```{r, out.width = "120%"}
+text_size <- 6
+nhsn_archive_flu %>% autoplot(value) + theme(strip.text.x = element_text(size = text_size, margin = margin(.1, 0, .1, 0, "cm")), axis.text = element_text(size =text_size, angle = 45), legend.title = element_text(size = text_size), legend.text = element_text(size = text_size), legend.key.size = unit(0.5, "cm")) + scale_size_manual(values = c(0.5))
 ```
 
 Since this is probably too small to actually be legible, let's figure out the states with the worst revision behavior and plot those.
@@ -105,6 +113,7 @@ It is probably worth adding a measure at the key level of how systematic the bia
 These seem likely to be estimable beforehand.
 Exceptions that include at least one case of over-reporting: `mn`, `id`, `nh`
 These are not later backfilled; they seem more like bad estimates or entry error.
+
 ### Data substitions
 And we actually need to compare this with the data revision estimates:
 ```{r}
@@ -129,10 +138,9 @@ data_as_it_was <- map(
   list_rbind() %>% mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
 ```
 
-realigning all of
+and then we compute several notions of differences
 
 ```{r}
-final_values %>% filter(time_value > "2025-01-03")
 full_table <- data_substitutions %>%
   left_join(
     final_values,
@@ -158,11 +166,13 @@ diffs %>%
 ```
 
 The table is sorted by `rel_diff`.
+
 - `abs_diff` gives how much our adjustment was over by (so positive means we were over the latest value).
 
 - `rel_diff` gives the difference relative to the latest value; for `nh` on `03/29` (the last entry in the table), our estimate was nearly twice as large as the latest value.
+  This is mostly to compensate for the different magnitudes of values across time and geo.
 
-- `rel_rev_diff` on the other hand divides that by how much the value as of the forecast date was off; for `nh` on `03/29` again, it was merely 73%, so we did bring it closer to the actual value. Any of these which are <1 are "successful" in the sense that we were closer to the latest value than the as_of value was. An infinite value tells us that we adjusted a value that hasn't been corrected. The sign for `rel_rev_diff` is a bit confusing, and tells us whether our estimate and the as of value were both larger/smaller than the latest value, or one larger and one smaller.
+- `rel_rev_diff` is the most appropriate to view as a substitution scoring. It divides the `abs_diff` by how much the value as of the forecast date was off; for `nh` on `03/29` again, it was merely 73%, so we did bring it closer to the actual value. Any of these which are <1 are "successful" in the sense that we were closer to the latest value than the as_of value was. An infinite value tells us that we adjusted a value that hasn't been corrected. The sign for `rel_rev_diff` is a bit confusing, and tells us whether our estimate and the as of value were both larger/smaller than the latest value, or one larger and one smaller.
 
 How many did we substitute a more accurate value?
 
@@ -170,7 +180,7 @@ How many did we substitute a more accurate value?
 mean(abs(diffs$rel_rev_diff) < 1)
 ```
 
-34%, so not a great track record overall.
+around 35%, so not a great track record overall.
 How about lower than the target vs higher than the target?
 ```{r}
 diffs %>%
@@ -181,16 +191,53 @@ diffs %>%
 
 So we did marginally better when it was below, but much worse when it was above.
 
-Overall, it turns out our value substitutions did not actually help much.
+Overall, it turns out our value substitutions did not actually help much for flu.
 
 ## NSSP
+
+```{r}
+nssp_archive_flu <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "nssp_archive_data"))
+
+nssp_archive_flu <- nssp_archive_flu$DT %>% filter(time_value >= "2024-11-19", geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
+nssp_archive_flu$time_type <- "day"
+revision_summary_nssp <- nssp_archive_flu %>% epiprocess::revision_analysis(nssp, min_waiting_period = NULL)
+```
+
+```{r}
+revision_summary_nssp %>% print(quick_revision = 7)
+```
+
+So very few weeks have no revision, but ~70% have only a small number of revisions.
+And most (87%) are very close to their final values
+
+```{r}
+av_re_spread <- revision_summary_nssp$revision_behavior %>%
+  group_by(geo_value) %>%
+  summarize(rel_spread = mean(rel_spread, na.rm = TRUE)) %>%
+  arrange(desc(rel_spread)) %>%
+  filter(geo_value %nin% c("vi", "as", "gu"))
+```
+
+
+```{r}
+nssp_archive_flu %>% autoplot(nssp, .facet_filter = geo_value %in% av_re_spread$geo_value[1:9]) + theme(strip.text.x = element_text(size = 8))
+```
+
+```{r}
+nssp_archive_flu %>% autoplot(nssp, .facet_filter = geo_value %in% av_re_spread$geo_value[10:18]) + theme(strip.text.x = element_text(size = 8))
+```
+
+These corrections are *way* more regular than nhsn, so we can definitely do backcasting to adjust the values.
+
 ### Revision behavior
 ### Correlation with latest
+
 Does NSSP actually correlate better with the latest value than nhsn itself does?
 This was a property we were relying on through the season to generate our corrections.
 
 # Covid revision
 And now for ~ the same idea, but for covid
+
 ## NHSN
 First we get the archive and remove the data older than the first version so as not to clog up the revision behavior, and display the overall revision summary[^1].
 
@@ -212,7 +259,7 @@ The "actual value" change isn't really worth thinking about because this is coun
 Here's a plot of the version changes for all locations.
 
 ```{r}
-nhsn_archive_covid %>% autoplot() + theme(strip.text.x = element_text(size = 8))
+nhsn_archive_covid %>% autoplot(value) + theme(strip.text.x = element_text(size = text_size, margin = margin(.1, 0, .1, 0, "cm")), axis.text = element_text(size =text_size, angle = 45), legend.title = element_text(size = text_size), legend.text = element_text(size = text_size), legend.key.size = unit(0.5, "cm")) + scale_size_manual(values = c(0.5))
 ```
 
 Since this is probably too small to actually be legible, let's figure out the states with the worst revision behavior and plot those.
@@ -255,6 +302,7 @@ Perhaps the revisions are more chaotic.
 `ok` has more extreme under-reporting than in the case of flu, but again this seems to likely be a factor of the number of cases, suggesting their underreporting happens in absolute number of cases rather than relative[^2].
 
 Like flu, these seem likely to be estimable beforehand.
+
 ### Data substitutions
 And we actually need to compare revision behavior with our estimates of the correct values:
 ```{r}
@@ -332,8 +380,50 @@ diffs %>%
 When we were under, we always improved the situation. However when we were above, sometimes we made it worse.
 
 
+
+## NSSP
+
+```{r}
+nssp_archive_covid <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "nssp_archive_data"))
+
+nssp_archive_covid <- nssp_archive_covid$DT %>% filter(time_value >= "2024-11-19", geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
+nssp_archive_covid$time_type <- "day"
+revision_summary_nssp <- nssp_archive_covid %>% epiprocess::revision_analysis(nssp, min_waiting_period = NULL)
+```
+
+```{r}
+revision_summary_nssp %>% print(quick_revision = 7)
+```
+
+So few weeks have no revision (only 14%), but ~87% have only a small number of revisions.
+And most (81%) are close to their final values
+
+```{r}
+av_re_spread <- revision_summary_nssp$revision_behavior %>%
+  group_by(geo_value) %>%
+  summarize(rel_spread = mean(rel_spread, na.rm = TRUE)) %>%
+  arrange(desc(rel_spread)) %>%
+  filter(geo_value %nin% c("vi", "as", "gu"))
+```
+
+
+```{r}
+nssp_archive_covid %>% autoplot(nssp, .facet_filter = geo_value %in% av_re_spread$geo_value[1:9]) + theme(strip.text.x = element_text(size = 8))
+```
+
+```{r}
+nssp_archive_covid %>% autoplot(nssp, .facet_filter = geo_value %in% av_re_spread$geo_value[10:18]) + theme(strip.text.x = element_text(size = 8))
+```
+
+A similar story to flu, these corrections are *way* more regular than nhsn, so we can definitely do backcasting to adjust the values.
+
 # Correlations between the two archives
 
+Visually, the flu and covid nhsn datasets seem to have related revision behavior.
+We should probably study this more carefully, but the question is a somewhat difficult one.
+One potential way to go about this is for each `(time_value, geo_value)` pair, do a correlation of the time series across `version`.
+Alteratively, we could do the same, but for the differences between versions (so correlate the correction amount).
+
 [^1]: `min_waiting_period` is `NULL` here since we're plotting mid-season, while `quick_revision = 7` because this is weekly data represented using days (because the versions are days).
 
 [^2]: which would definitely be a bit odd of an effect.

From 3c6de3b4047f9f9c4107b851b7ae82218d5192b2 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Tue, 22 Apr 2025 14:21:52 -0500
Subject: [PATCH 27/62] scores mix all forecasters, first covid day problems

---
 R/forecasters/formatters.R                    | 14 +++++-----
 R/targets/score_targets.R                     |  3 ---
 reports/template.md                           |  5 ++--
 scripts/covid_hosp_prod.R                     | 26 +++++++++++++++++-
 scripts/flu_hosp_prod.R                       | 27 ++++++++++++++++++-
 .../reports/revision_summary_report_2025.Rmd  |  2 +-
 6 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/R/forecasters/formatters.R b/R/forecasters/formatters.R
index d72c376c..f8aed36d 100644
--- a/R/forecasters/formatters.R
+++ b/R/forecasters/formatters.R
@@ -80,17 +80,17 @@ format_scoring_utils <- function(forecasts_and_ensembles, disease = c("flu", "co
   fc_ens[, "horizon"] <- as.integer(floor((fc_ens$target_end_date - fc_ens$reference_date) / 7))
   fc_ens[, "output_type"] <- "quantile"
   fc_ens[, "output_type_id"] <- fc_ens$quantile
-  left_join(
-    fc_ens,
-    get_population_data() %>%
-      select(state_id, state_code),
-    by = c("geo_value" = "state_id")
-  ) %>%
+  fc_ens %>%
+    left_join(
+      get_population_data() %>%
+        select(state_id, state_code),
+      by = c("geo_value" = "state_id")
+    ) %>%
     rename(location = state_code, model_id = forecaster) %>%
     select(reference_date, target, horizon, target_end_date, location, output_type, output_type_id, value, model_id) %>%
     drop_na() %>%
     arrange(location, target_end_date, reference_date, output_type_id) %>%
-    group_by(location, target_end_date, reference_date) %>%
+    group_by(model_id, location, target_end_date, reference_date) %>%
     mutate(value = sort(value)) %>%
     ungroup()
 }
diff --git a/R/targets/score_targets.R b/R/targets/score_targets.R
index dfceb9b5..a0648ed6 100644
--- a/R/targets/score_targets.R
+++ b/R/targets/score_targets.R
@@ -36,9 +36,6 @@ score_forecasts <- function(nhsn_latest_data, joined_forecasts_and_ensembles, di
     joined_forecasts_and_ensembles[joined_forecasts_and_ensembles$forecast_date <= max_forecast_date,] %>%
     format_scoring_utils(disease = disease)
   scores <- forecasts_formatted %>%
-    arrange(location, target_end_date, reference_date, output_type_id) %>%
-    group_by(location, target_end_date, reference_date) %>%
-    mutate(value = sort(value)) %>%
     filter(location %nin% c("US", "60", "66", "78")) %>%
     hubEvals::score_model_out(
       truth_data,
diff --git a/reports/template.md b/reports/template.md
index fb6f4dbe..a295761b 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -10,9 +10,10 @@
 
 ## Summary Reports
 
-- [Season Summary](season_summary.html)
+- [Season Summary](season_summary.html) The other documents are also linked from here
 - [An Analysis of Decreasing Behavior in Forecasters](decreasing_forecasters.html)
-- [Revision Behavior](revision_summary_2025.html)
+- [Revision Behavior](revision_summary_report_2025.html)
+- [Covid's problematic initial forecast](first_day_wrong.html)
 - [NHSN 2024-2025 Data Analysis](new_data.html)
 
 ### Flu
diff --git a/scripts/covid_hosp_prod.R b/scripts/covid_hosp_prod.R
index 5960e20a..739a619c 100644
--- a/scripts/covid_hosp_prod.R
+++ b/scripts/covid_hosp_prod.R
@@ -429,7 +429,31 @@ if (g_backtest_mode) {
       name = joined_forecasts_and_ensembles,
       ensemble_targets[["forecasts_and_ensembles"]],
       command = {
-        dplyr::bind_rows(!!!.x, external_forecasts)
+        local_together <- dplyr::bind_rows(!!!.x)
+        # only seek to score on dates that are present both locally and in the
+        # remote
+        viable_dates <- inner_join(
+          local_together %>%
+            select(geo_value, forecast_date) %>%
+            distinct() %>%
+            arrange(forecast_date) %>%
+            mutate(
+              forecast_date = ceiling_date(forecast_date, unit = "week", week_start = 6)
+            ),
+          external_forecasts %>%
+            select(geo_value, forecast_date) %>%
+            distinct() %>% filter(forecast_date > "2024-10-01"),
+          by = c("geo_value", "forecast_date")
+        )
+        dplyr::bind_rows(
+          local_together %>%
+            mutate(
+              forecast_date = ceiling_date(forecast_date, unit = "week", week_start = 6)
+            ) %>%
+            inner_join(viable_dates, by = c("geo_value", "forecast_date")),
+          external_forecasts %>%
+            inner_join(viable_dates, by = c("geo_value", "forecast_date"))
+        )
       }
     ),
     tar_target(
diff --git a/scripts/flu_hosp_prod.R b/scripts/flu_hosp_prod.R
index 3323231b..871094dd 100644
--- a/scripts/flu_hosp_prod.R
+++ b/scripts/flu_hosp_prod.R
@@ -470,12 +470,37 @@ if (g_backtest_mode) {
       name = joined_forecasts_and_ensembles,
       ensemble_targets[["forecasts_and_ensembles"]],
       command = {
-        dplyr::bind_rows(!!!.x, external_forecasts)
+        local_together <- dplyr::bind_rows(!!!.x)
+        # only seek to score on dates that are present both locally and in the
+        # remote
+        viable_dates <- inner_join(
+          local_together %>%
+            select(geo_value, forecast_date) %>%
+            distinct() %>%
+            arrange(forecast_date) %>%
+            mutate(
+              forecast_date = ceiling_date(forecast_date, unit = "week", week_start = 6)
+            ),
+          external_forecasts %>%
+            select(geo_value, forecast_date) %>%
+            distinct() %>% filter(forecast_date > "2024-10-01"),
+          by = c("geo_value", "forecast_date")
+        )
+        dplyr::bind_rows(
+          local_together %>%
+            mutate(
+              forecast_date = ceiling_date(forecast_date, unit = "week", week_start = 6)
+            ) %>%
+            inner_join(viable_dates, by = c("geo_value", "forecast_date")),
+          external_forecasts %>%
+            inner_join(viable_dates, by = c("geo_value", "forecast_date"))
+        )
       }
     ),
     tar_target(
       name = scores,
       command = {
+        browser()
         nhsn_latest_end_of_week <-
           nhsn_latest_data %>%
           mutate(
diff --git a/scripts/reports/revision_summary_report_2025.Rmd b/scripts/reports/revision_summary_report_2025.Rmd
index 896ad030..a161684b 100644
--- a/scripts/reports/revision_summary_report_2025.Rmd
+++ b/scripts/reports/revision_summary_report_2025.Rmd
@@ -44,7 +44,7 @@ suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 
 # Overall takeaways
 
-There is substantial underreporting behavior that is fairly consistent for a single geo.
+There is substantial under-reporting behavior that is fairly consistent for a single geo.
 We can probably improve our forecasts by including revision behavior.
 Further, flu and covid revision behavior is fairly strongly correlated; it is reported through the same channels by the same people, so this makes sense.
 We should look into the extra columns to see if it provides useful information for handling revision behavior.

From b06f23a3fae805ff88d820c899b8b5f79d44454d Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Tue, 22 Apr 2025 18:10:24 -0500
Subject: [PATCH 28/62] phase definitions and scores

---
 scripts/reports/season_summary_2025.Rmd | 495 +++++++++++++++++++++++-
 1 file changed, 494 insertions(+), 1 deletion(-)

diff --git a/scripts/reports/season_summary_2025.Rmd b/scripts/reports/season_summary_2025.Rmd
index ac8390e8..42be4b3b 100644
--- a/scripts/reports/season_summary_2025.Rmd
+++ b/scripts/reports/season_summary_2025.Rmd
@@ -8,6 +8,20 @@ editor_options:
   chunk_output_type: console
 ---
 
+```{css, echo=FALSE}
+body {
+  display: block;
+  max-width: 1280px !important;
+  margin-left: auto;
+  margin-right: auto;
+}
+
+body .main-container {
+  max-width: 1280px !important;
+  width: 1280px !important;
+}
+```
+
 $$\\[.4in]$$
 
 ```{r echo=FALSE}
@@ -21,4 +35,483 @@ ggplot2::theme_set(ggplot2::theme_bw())
 source(here::here("R/load_all.R"))
 ```
 
-Draw the rest of the owl.
\ No newline at end of file
+# Season Scoring
+<!-- NOTE: to run this, you must run `make prod-flu` and `make prod-covid` successfully, as it will read the targets directly from the store. -->
+
+```{r setup, include=FALSE}
+library(DT)
+
+# Define aggregation functions
+Mean <- function(x) mean(x, na.rm = TRUE)
+GeoMean <- function(x, offset = 0) exp(Mean(log(x + offset)))
+flu_scores <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "scores"))
+forecast_dates <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "forecast_dates"))
+covid_scores <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "scores"))
+```
+
+Some overall conclusions here about the scores for the season.
+
+## Flu Scores
+
+Some conclusions here about the scores for the season.
+
+
+Before we get into the actual scores, we need to define how we go about creating 3 different phases.
+For the details, see the fold.
+They are `increasing`, `peak`, and `decreasing`.
+<details>
+  <summary> Splitting the season </summary>
+### Splitting the season
+
+Since our forecasters tend to do very differently depending on the phase in the pandemic, in addition to an overall score, let's split according to phase.
+There's a great deal of ambiguity in defining the phase however; to keep it simple, lets divide the season into 3 periods:
+
+1. `increasing` Before the peak; normally increasing but may include inital flat periods
+2. `peak` The time interval where the cases are oscillating near or around the peak
+3. `decreasing` The trailing end of the season after the peak; normally decreasing, but probably including flat periods towards the end
+
+2 is the most ambiguous of these, since sometimes there is a clean peak, and sometimes there are multiple peaks.
+To do this simply, let's see what seasons we get if we use "above 50% of the peak value" to define phase 2.
+
+```{r}
+flu_archive <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "nhsn_archive_data"))
+flu_current <- flu_archive %>%
+  epix_as_of_current() %>%
+  filter(geo_value %nin% c("as", "gu", "mp", "vi"))
+compute_peak_season <- function(data_current, threshold = 0.5, start_of_year = as.Date("2024-11-01")) {
+  season_length <- data_current %>% pull(time_value) %>% max() - start_of_year
+  data_current %>%
+    filter(time_value > start_of_year) %>%
+    group_by(geo_value) %>%
+    mutate(max_val = max(value)) %>%
+    filter(value >= threshold * max_val) %>%
+    summarize(first_above = min(time_value), last_above = max(time_value)) %>%
+    mutate(
+      duration = last_above - first_above,
+      rel_duration = as.integer(duration) / as.integer(season_length))
+}
+classify_phase <- function(time_value, first_above, last_above, rel_duration, threshold) {
+  case_when(
+    rel_duration > threshold ~ "flat",
+    time_value < first_above ~ "increasing",
+    time_value > last_above ~ "decreasing",
+    .default = "peak"
+  ) %>% factor(levels = c("increasing", "peak", "decreasing", "flat"))
+}
+flu_within_max <- compute_peak_season(flu_current)
+sanity_check_classifying <- flu_current %>%
+  left_join(flu_within_max, by = "geo_value") %>%
+  mutate(phase = classify_phase(time_value, first_above, last_above, rel_duration, 0.6)) %>%
+  group_by(geo_value) %>%
+  distinct(phase)
+```
+
+
+```{r, fig.width = 15, fig.height = 15}
+flu_current %>%
+  filter(time_value > "2024-11-01") %>%
+  autoplot(value, .facet_by = "geo_value") +
+  geom_vline(data = flu_within_max, aes(xintercept = first_above)) +
+  geom_vline(data = flu_within_max, aes(xintercept = last_above)) +
+  facet_wrap(~geo_value, scale = "free") +
+  theme(legend.position = "none")
+```
+
+There is a wide variety of length for the peak by this definition, but it does seem to naturally reflect the difference in dynamics.
+`ok` is quite short for example, because it has a simple clean peak, whereas `or` has literally 2 peaks with the same height, so the entire interval between them is classified as peak.
+
+</details>
+### Forecaster Scores for Flu:  {.tabset}
+
+Forecast dates: `r forecast_dates`
+
+#### Scores Aggregated By Forecaster
+
+```{r, fig.height = 60, fig.width = 12, echo=FALSE}
+flu_scores %>%
+  group_by(forecaster) %>%
+  summarize(
+    mean_wis = round(Mean(wis), 2),
+    geomean_wis = round(GeoMean(wis), 2),
+    mean_ae = round(Mean(ae_median), 2),
+    geomean_ae = round(GeoMean(ae_median), 2),
+    mean_coverage_90 = round(Mean(interval_coverage_90), 2),
+    n = n()
+  ) %>%
+  rename(id = forecaster) %>%
+  datatable()
+```
+
+#### Scores Aggregated By Phase
+
+```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+phase_scores <- flu_scores %>%
+  left_join(flu_within_max, by = "geo_value") %>%
+  mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, 0.6)) %>%
+  group_by(forecaster, phase) %>%
+  summarize(
+    across(c(wis, ae_median, interval_coverage_90), \(x) round(Mean(x), 2)),
+    n = n(),
+    .groups = "drop"
+  )
+p <- ggplot(phase_scores, aes(x = phase, y = wis, color = forecaster, group = forecaster)) +
+  geom_line() +
+  geom_point() +
+  theme_bw() +
+  labs(x = "Phase", y = "Mean WIS") 
+
+ggplotly(p)
+```
+
+#### Scores Aggregated By Forecast Date
+
+```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+agg_flu <- flu_scores %>%
+  filter(forecast_date > "2024-10-01") %>%
+  filter(forecast_date != as.Date("2025-01-25")) %>%
+  group_by(forecaster, forecast_date) %>%
+  summarize(
+    mean_wis = round(Mean(wis), 2),
+    geomean_wis = round(GeoMean(wis), 2),
+    mean_ae = round(Mean(ae_median), 2),
+    geomean_ae = round(GeoMean(ae_median), 2),
+    mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
+  )
+
+# Plot the scores as lines across forecast_date
+p <- ggplot(agg_flu, aes(x = forecast_date, y = mean_wis, color = forecaster)) +
+  geom_line() +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+  labs(x = "Forecast Date", y = "Mean WIS")
+
+ggplotly(p)
+```
+
+#### Scores Aggregated By Ahead
+
+```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+agg <- flu_scores %>%
+  group_by(forecaster, ahead) %>%
+  summarize(
+    mean_wis = round(Mean(wis), 2),
+    geomean_wis = round(GeoMean(wis), 2),
+    mean_ae = round(Mean(ae_median), 2),
+    geomean_ae = round(GeoMean(ae_median), 2),
+    mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
+  )
+
+# Plot the scores as lines across forecast_date
+p <- ggplot(agg, aes(x = ahead, y = mean_wis, color = forecaster)) +
+  geom_line() +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+  labs(x = "Ahead", y = "Mean WIS")
+ggplotly(p)
+```
+
+
+## Covid Scores
+
+Some conclusions here about the scores for the season.
+
+One peculiar thing about Covid scoring: the first day has *much* worse scores than almost any of the subsequent days (you can see this in the Scores Aggregated By Forecast Date tab below).
+This mostly comes from the first week having larger revisions than normal.
+This is discussed in more detail in [this notebook](first_day_wrong.html).
+
+
+Before we get into the actual scores, we need to define how we go about creating 4 different phases.
+They are `increasing`, `peak`, `decreasing`, and `flat`.
+The last phase, `flat`, covers geos which didn't have an appreciable season for the year, which was relatively common for covid.
+For the details, see the fold.
+
+<details>
+  <summary> Splitting the season </summary>
+### Splitting the season
+
+Lets see what kind of phase boundaries we get by reusing the concept from flu for covid.
+That is, the peak phase for a given geo is defined to be the interval when the value is within 50% of the peak.
+
+```{r}
+covid_archive <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "nhsn_archive_data"))
+covid_current <- covid_archive %>%
+  epix_as_of_current() %>%
+  filter(geo_value %nin% c("as", "gu", "mp", "vi"))
+covid_within_max <- compute_peak_season(covid_current)
+```
+
+```{r, fig.width = 15, fig.height = 15}
+covid_current %>%
+  filter(time_value > "2024-11-01") %>%
+  autoplot(value, .facet_by = "geo_value") +
+  geom_vline(data = covid_within_max, aes(xintercept = first_above)) +
+  geom_vline(data = covid_within_max, aes(xintercept = last_above)) +
+  facet_wrap(~geo_value, scale = "free") +
+  ylim(0, NA) +
+  theme(legend.position = "none")
+```
+
+This definition may be a bit more problematic for covid than for flu.
+`dc` `ga`, `nc`, and `nv` bin ~the entire season into the "peak" phase.
+Primarily this is actually because only some locations actually had a covid season this year; if we drop the filter for this season and add a vline indicating the season start:
+
+```{r, fig.width = 15, fig.height = 15}
+covid_current %>%
+  filter(time_value > "2022-06-01") %>%
+  autoplot(value, .facet_by = "geo_value") +
+  geom_vline(aes(xintercept = as.Date("2024-11-01"))) +
+  ylim(0, NA) +
+  theme(legend.position = "none")
+```
+
+Then we can see a very muted season in many locations, such as `ar` or `co`, and no season at all in some locations, such as `ak`.
+Others, such as `az`, `in`, or `mn` have a season that is on-par with historical ones.
+
+How to handle this? 
+One option is to include a separate phase for no season that applies to the entire `geo_value` if more than half of the `time_value`s are within 50% of the peak:
+
+```{r}
+no_phase <- covid_within_max %>% arrange(desc(rel_duration)) %>% filter(rel_duration > 0.6)
+no_phase %>% arrange(rel_duration)
+```
+
+which is 27 out of our 53 geos.
+0.6 is admittedly a pretty arbitrary cut-off, chosen so that `geo_value`s with high `rel_duration`s which still appear to have a clear peak, such as `de`, `us`, `wy`, and `mi` aren't assigned to `flat`.
+We can probably decrase this filter as we move later into the season and locations which are still decreasing finish dropping.
+As a sanity check, let's plot just these locations to confirm that we're not pulling in geos with actual peaks
+
+```{r, fig.width = 15, fig.height = 15}
+covid_current %>%
+  filter(time_value > "2022-06-01") %>%
+  filter(geo_value %in% no_phase$geo_value) %>%
+  autoplot(value, .facet_by = "geo_value") +
+  geom_vline(aes(xintercept = as.Date("2024-11-01"))) +
+  ylim(0, NA) +
+  theme(legend.position = "none")
+```
+
+
+
+</details>
+### Forecaster Scores for Covid:  {.tabset}
+
+Forecast dates: `r forecast_dates`
+
+#### Scores Aggregated By Forecaster
+
+```{r, fig.height = 60, fig.width = 12, echo=FALSE}
+covid_scores %>%
+  group_by(forecaster) %>%
+  summarize(
+    mean_wis = round(Mean(wis), 2),
+    geomean_wis = round(GeoMean(wis), 2),
+    mean_ae = round(Mean(ae_median), 2),
+    geomean_ae = round(GeoMean(ae_median), 2),
+    mean_coverage_90 = round(Mean(interval_coverage_90), 2),
+    n = n()
+  ) %>%
+  rename(id = forecaster) %>%
+  datatable()
+```
+
+#### Scores Aggregated By Phase
+
+```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+phase_scores <- covid_scores %>%
+  left_join(covid_within_max, by = "geo_value") %>%
+  mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, 0.6)) %>%
+  group_by(forecaster, phase) %>%
+  summarize(
+    across(c(wis, ae_median, interval_coverage_90), \(x) round(Mean(x), 2)),
+    n = n(),
+    .groups = "drop"
+  )
+p <- ggplot(phase_scores, aes(x = phase, y = wis, color = forecaster, group = forecaster)) +
+  geom_line() +
+  geom_point() +
+  theme_bw() +
+  labs(x = "Phase", y = "Mean WIS")
+
+ggplotly(p)
+```
+#### Scores Aggregated By Forecast Date
+
+```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+agg_flu <- covid_scores %>%
+  filter(forecast_date > "2024-10-01") %>%
+  filter(forecast_date != as.Date("2025-01-25")) %>%
+  group_by(forecaster, forecast_date) %>%
+  summarize(
+    mean_wis = round(Mean(wis), 2),
+    geomean_wis = round(GeoMean(wis), 2),
+    mean_ae = round(Mean(ae_median), 2),
+    geomean_ae = round(GeoMean(ae_median), 2),
+    mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
+  )
+
+# Plot the scores as lines across forecast_date
+p <- ggplot(agg_flu, aes(x = forecast_date, y = mean_wis, color = forecaster)) +
+  geom_line() +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+  labs(x = "Forecast Date", y = "Mean WIS")
+
+ggplotly(p)
+```
+
+#### Scores Aggregated By Ahead
+
+```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+agg <- covid_scores %>%
+  group_by(forecaster, ahead) %>%
+  summarize(
+    mean_wis = round(Mean(wis), 2),
+    geomean_wis = round(GeoMean(wis), 2),
+    mean_ae = round(Mean(ae_median), 2),
+    geomean_ae = round(GeoMean(ae_median), 2),
+    mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
+  )
+
+# Plot the scores as lines across forecast_date
+p <- ggplot(agg, aes(x = ahead, y = mean_wis, color = forecaster)) +
+  geom_line() +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+  labs(x = "Ahead", y = "Mean WIS")
+ggplotly(p)
+```
+
+# Revision behavior and data substitution
+
+This is covered in more detail in [revision_summary_report_2025](revision_summary_report_2025.html).
+NHSN has substantial under-reporting behavior that is fairly consistent for any single geo, though there a number of aberrant revisions, some of which change the entire trajectory for a couple of weeks.
+This is even more true for NSSP than NHSN, though the size of the revisions is much smaller, and they occur more quickly, so handling the revisions would be most useful for prediction, rather than for correcting data for fitting the forecaster.
+We can probably improve our forecasts by incorporating revision behavior.
+
+Further, flu and covid revision behavior is fairly strongly correlated; it is reported through the same channels by the same people, so this makes sense.
+We should look into the extra columns to see if it provides useful information for handling revision behavior.
+
+
+## Data substitution
+
+In short, this didn't go well.
+It was a coin toss for covid, and worse than not doing corrections for flu.
+
+## Revision examples
+
+```{r}
+nhsn_archive_flu <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "nhsn_archive_data"))
+
+nhsn_archive_flu <- nhsn_archive_flu$DT %>% filter(time_value >= "2024-11-19", geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
+nhsn_archive_flu$time_type <- "day"
+revision_summary <- nhsn_archive_flu %>% epiprocess::revision_analysis(value, min_waiting_period = NULL)
+av_re_spread <- revision_summary$revision_behavior %>%
+  group_by(geo_value) %>%
+  summarize(rel_spread = mean(rel_spread, na.rm = TRUE)) %>%
+  arrange(desc(rel_spread)) %>%
+  filter(geo_value %nin% c("vi", "as", "gu"))
+nhsn_archive_flu %>% autoplot(value, .facet_filter = geo_value %in% av_re_spread$geo_value[1:9]) +
+  theme(strip.text.x = element_text(size = 8), plot.title = element_text(hjust = 0.5)) +
+  labs(title = "Flu revisions for the highest mean relative spread")
+```
+
+```{r}
+nhsn_archive_covid <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "nhsn_archive_data"))
+
+nhsn_archive_covid <- nhsn_archive_covid$DT %>% filter(time_value >= "2024-11-19", geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
+nhsn_archive_covid$time_type <- "day"
+revision_summary <- nhsn_archive_covid %>% epiprocess::revision_analysis(value, min_waiting_period = NULL)
+av_re_spread <- revision_summary$revision_behavior %>%
+  group_by(geo_value) %>%
+  summarize(rel_spread = mean(rel_spread, na.rm = TRUE)) %>%
+  arrange(desc(rel_spread)) %>%
+  filter(geo_value %nin% c("vi", "as", "gu"))
+nhsn_archive_covid %>% autoplot(value, .facet_filter = geo_value %in% av_re_spread$geo_value[1:9]) +
+  theme(strip.text.x = element_text(size = 8), plot.title = element_text(hjust = 0.5)) +
+  labs(title = "Covid revisions for the highest mean relative spread")
+```
+
+
+# Appendix
+## Methods of selecting season phase
+There's a lot of flexibility in this decision.
+Here's some alternatives that we looked at.
+
+```{r}
+flu_gr <- flu_current %>%
+  group_by(geo_value) %>%
+  mutate(gr = growth_rate(value, method = "linear_reg", h = 3)) %>%
+  filter(time_value > "2024-11-01")
+flu_gr %>% autoplot(gr, .facet_by = "geo_value")
+flu_max_dates <- flu_current %>%
+  group_by(geo_value) %>%
+  slice_max(value) %>%
+  select(geo_value, time_value_max = time_value)
+flu_peak_season <- flu_max_dates %>% ungroup() %>% summarize(peak_start = min(time_value_max), peak_end = max(time_value_max)) %>% pivot_longer(cols = c(peak_start, peak_end))
+flu_gr %>%
+  mutate(value = value / max(value)) %>%
+  mutate(neg = gr < 0) %>%
+  group_by(geo_value) %>%
+  arrange(desc(time_value)) %>%
+  mutate(count_so_far = TRUE, count_so_far = cumsum(count_so_far), frac_neg = (cumsum(neg)) / sum(neg)) %>%
+  arrange(geo_value) %>%
+  left_join(flu_max_dates, by = "geo_value") %>%
+  ggplot(aes(x = time_value, y = frac_neg)) +
+  geom_point() +
+  geom_line(aes(y = gr)) +
+  geom_line(aes(y = value)) +
+  geom_vline(data = flu_peak_season, aes(xintercept = value)) +
+  facet_wrap(~geo_value, scale = "free")
+flu_gr %>%
+  left_join(flu_max_dates, by = "geo_value") %>%
+  filter(time_value > time_value_max) %>%
+  arrange(time_value)
+```
+
+```{r}
+flu_within_max <- flu_current %>%
+  filter(time_value > "2024-11-01") %>%
+  group_by(geo_value) %>%
+  mutate(max_val = max(value)) %>%
+  filter(value >= max_val/2) %>%
+  summarize(first_above = min(time_value), last_above = max(time_value))
+flu_current %>%
+  filter(time_value > "2024-11-01") %>%
+  autoplot(value, .facet_by = "geo_value") +
+  geom_vline(data = flu_within_max, aes(xintercept = first_above)) +
+  geom_vline(data = flu_within_max, aes(xintercept = last_above)) +
+  geom_vline(data = flu_peak_season, aes(xintercept = value, color = "green")) +
+  facet_wrap(~geo_value, scale = "free")
+```
+
+```{r}
+covid_gr <- covid_archive %>%
+  epix_as_of_current() %>%
+  filter(geo_value %nin% c("as", "gu", "mp")) %>%
+  group_by(geo_value) %>%
+  mutate(gr = growth_rate(value, method = "linear_reg", h = 3)) %>%
+  filter(time_value > "2024-09-01")
+covid_gr %>% autoplot(gr, .facet_by = "geo_value")
+```
+
+```{r}
+covid_gr %>%
+  mutate(value = value / max(value)) %>%
+  mutate(neg = gr < 0) %>%
+  group_by(geo_value) %>%
+  arrange(desc(time_value)) %>%
+  mutate(count_so_far = TRUE, count_so_far = cumsum(count_so_far), frac_neg = (cumsum(neg)) / sum(neg)) %>%
+  arrange(geo_value) %>%
+  ggplot(aes(x = time_value, y = frac_neg)) +
+  geom_point() +
+  geom_line(aes(y = gr)) +
+  geom_line(aes(y = value)) +
+  facet_wrap(~geo_value, scale = "free")
+covid_max_dates <- covid_gr %>%
+  slice_max(gr) %>%
+  select(geo_value, time_value_max = time_value)
+covid_gr %>%
+  left_join(covid_max_dates, by = "geo_value") %>%
+  filter(time_value > time_value_max) %>%
+  arrange(time_value)
+```

From 4967de73046d273a5a8a26e6943466892ae16dd6 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 22 Apr 2025 18:00:55 -0700
Subject: [PATCH 29/62] fix: covid generation dates

---
 scripts/covid_hosp_prod.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/covid_hosp_prod.R b/scripts/covid_hosp_prod.R
index 739a619c..2674c7f2 100644
--- a/scripts/covid_hosp_prod.R
+++ b/scripts/covid_hosp_prod.R
@@ -35,7 +35,7 @@ if (!g_backtest_mode) {
   # override this. It should be a Wednesday.
   g_forecast_dates <- round_date(g_forecast_generation_dates, "weeks", week_start = 3)
 } else {
-  g_forecast_generation_dates <- c(as.Date(c("2024-11-22", "2024-11-27", "2024-12-04", "2024-12-11", "2024-12-18", "2024-12-26", "2025-01-02")), seq.Date(as.Date("2025-01-08"), Sys.Date(), by = 7L))
+  g_forecast_generation_dates <- c(as.Date(c("2024-11-20", "2024-11-27", "2024-12-04", "2024-12-11", "2024-12-18", "2024-12-26", "2025-01-02")), seq.Date(as.Date("2025-01-08"), Sys.Date(), by = 7L))
   g_forecast_dates <- seq.Date(as.Date("2024-11-20"), Sys.Date(), by = 7L)
 }
 

From ddd1fdbcd0187b470738ec50a04c7c49603c5b63 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Fri, 25 Apr 2025 16:46:34 -0500
Subject: [PATCH 30/62] external scores updating, score only shared dates

---
 R/utils.R                                     |  49 ++-
 covid_geo_exclusions.csv                      |  16 +-
 reports/template.md                           |   4 +-
 scripts/covid_hosp_prod.R                     |  27 +-
 scripts/flu_hosp_prod.R                       |  31 +-
 scripts/get_forecast_data.r                   |   4 +-
 .../one_offs/read_covid_forecast_hub_data.jl  |  49 ++-
 scripts/reports/season_summary_2025.Rmd       | 374 ++++++++++++++++--
 8 files changed, 440 insertions(+), 114 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index db5385d0..c4eecfed 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -569,7 +569,7 @@ get_targets_errors <- function(project = tar_path_store(), top_n = 10) {
 #'   wait_seconds = 1,
 #'   fn = pub_covidcast,
 #'   source = "nssp",
-#'   signals= "pct_ed_visits_covid",
+#'   signals = "pct_ed_visits_covid",
 #'   geo_type = "state",
 #'   geo_values = "*",
 #'   time_type = "week"
@@ -606,3 +606,50 @@ validate_epi_data <- function(epi_data) {
 get_bucket_df_delphi <- function(prefix = "", bucket = "forecasting-team-data") {
   aws.s3::get_bucket_df(prefix = prefix, bucket = bucket) %>% tibble()
 }
+
+
+
+#' get the unique shared (geo_value, forecast_date, target_end_date) tuples present for each forecaster in `forecasts`
+get_unique <- function(forecasts) {
+  forecasters <- forecasts %>%
+    pull(forecaster) %>%
+    unique()
+  distinct <- map(
+    forecasters,
+    \(x) forecasts %>%
+      filter(forecaster == x) %>%
+      select(geo_value, forecast_date, target_end_date) %>%
+      distinct()
+  )
+  distinct_dates <- reduce(distinct, \(x, y) x %>% inner_join(y, by = c("geo_value", "forecast_date", "target_end_date")))
+  mutate(
+    distinct_dates,
+    forecast_date = round_date(forecast_date, unit = "week", week_start = 6)
+  )
+}
+
+#' filter the external and local forecasts to just the shared dates/geos
+#' some forecasters have a limited set of geos; we want to include those
+#' anyways, they are `tructated_forecasters`, while the external_forecasts may
+#' have previous years forecasts that we definitely want to exclude via
+#' `season_start`.
+filter_shared_geo_dates <- function(local_forecasts, external_forecasts, season_start = "2024-11-01", trucated_forecasters = "windowed_seasonal_extra_sources") {
+  viable_dates <- inner_join(
+    local_forecasts %>%
+      filter(forecaster %nin% trucated_forecasters) %>%
+      get_unique(),
+    external_forecasts %>%
+      filter(forecast_date > season_start) %>%
+      get_unique(),
+    by = c("geo_value", "forecast_date", "target_end_date")
+  )
+  dplyr::bind_rows(
+    local_forecasts %>%
+      mutate(
+        forecast_date = round_date(forecast_date, unit = "week", week_start = 6)
+      ) %>%
+      inner_join(viable_dates, by = c("geo_value", "forecast_date", "target_end_date")),
+    external_forecasts %>%
+      inner_join(viable_dates, by = c("geo_value", "forecast_date", "target_end_date"))
+  )
+}
diff --git a/covid_geo_exclusions.csv b/covid_geo_exclusions.csv
index ea464255..25a849a3 100644
--- a/covid_geo_exclusions.csv
+++ b/covid_geo_exclusions.csv
@@ -117,14 +117,14 @@ forecast_date,forecaster,geo_value,weight
 ##################
 # feb 12
 ##################
-2025-02-05, all, mp, 0
-2025-02-05, windowed_seasonal, all, 3
-2025-02-05, windowed_seasonal_extra_sources, all, 0.0
-2025-02-05, climate_linear, all, 3
-2025-02-05, linear, all, 0.5
-2025-02-05, linearlog, all, 0
-2025-02-05, climate_base, all, 0
-2025-02-05, climate_geo_agged, all, 0.0
+2025-02-12, all, mp, 0
+2025-02-12, windowed_seasonal, all, 3
+2025-02-12, windowed_seasonal_extra_sources, all, 0.0
+2025-02-12, climate_linear, all, 3
+2025-02-12, linear, all, 0.5
+2025-02-12, linearlog, all, 0
+2025-02-12, climate_base, all, 0
+2025-02-12, climate_geo_agged, all, 0.0
 ##################
 # feb 5
 ##################
diff --git a/reports/template.md b/reports/template.md
index a295761b..b40b3752 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -10,7 +10,9 @@
 
 ## Summary Reports
 
-- [Season Summary](season_summary.html) The other documents are also linked from here
+### 2025
+
+- [Season Summary](season_summary_2025.html) The other documents are also linked from here
 - [An Analysis of Decreasing Behavior in Forecasters](decreasing_forecasters.html)
 - [Revision Behavior](revision_summary_report_2025.html)
 - [Covid's problematic initial forecast](first_day_wrong.html)
diff --git a/scripts/covid_hosp_prod.R b/scripts/covid_hosp_prod.R
index 2674c7f2..c6d4b8c2 100644
--- a/scripts/covid_hosp_prod.R
+++ b/scripts/covid_hosp_prod.R
@@ -429,30 +429,9 @@ if (g_backtest_mode) {
       name = joined_forecasts_and_ensembles,
       ensemble_targets[["forecasts_and_ensembles"]],
       command = {
-        local_together <- dplyr::bind_rows(!!!.x)
-        # only seek to score on dates that are present both locally and in the
-        # remote
-        viable_dates <- inner_join(
-          local_together %>%
-            select(geo_value, forecast_date) %>%
-            distinct() %>%
-            arrange(forecast_date) %>%
-            mutate(
-              forecast_date = ceiling_date(forecast_date, unit = "week", week_start = 6)
-            ),
-          external_forecasts %>%
-            select(geo_value, forecast_date) %>%
-            distinct() %>% filter(forecast_date > "2024-10-01"),
-          by = c("geo_value", "forecast_date")
-        )
-        dplyr::bind_rows(
-          local_together %>%
-            mutate(
-              forecast_date = ceiling_date(forecast_date, unit = "week", week_start = 6)
-            ) %>%
-            inner_join(viable_dates, by = c("geo_value", "forecast_date")),
-          external_forecasts %>%
-            inner_join(viable_dates, by = c("geo_value", "forecast_date"))
+        filter_shared_geo_dates(
+          dplyr::bind_rows(!!!.x),
+          external_forecasts
         )
       }
     ),
diff --git a/scripts/flu_hosp_prod.R b/scripts/flu_hosp_prod.R
index 871094dd..1406357c 100644
--- a/scripts/flu_hosp_prod.R
+++ b/scripts/flu_hosp_prod.R
@@ -42,7 +42,7 @@ if (!g_backtest_mode) {
   # override this. It should be a Wednesday.
   g_forecast_dates <- round_date(g_forecast_generation_dates, "weeks", week_start = 3)
 } else {
-  g_forecast_generation_dates <- c(as.Date(c("2024-11-22", "2024-11-27", "2024-12-04", "2024-12-11", "2024-12-18", "2024-12-26", "2025-01-02")), seq.Date(as.Date("2025-01-08"), Sys.Date(), by = 7L))
+  g_forecast_generation_dates <- c(as.Date(c("2024-11-21", "2024-11-27", "2024-12-04", "2024-12-11", "2024-12-18", "2024-12-26", "2025-01-02")), seq.Date(as.Date("2025-01-08"), Sys.Date(), by = 7L))
   g_forecast_dates <- seq.Date(as.Date("2024-11-20"), Sys.Date(), by = 7L)
 }
 
@@ -470,30 +470,9 @@ if (g_backtest_mode) {
       name = joined_forecasts_and_ensembles,
       ensemble_targets[["forecasts_and_ensembles"]],
       command = {
-        local_together <- dplyr::bind_rows(!!!.x)
-        # only seek to score on dates that are present both locally and in the
-        # remote
-        viable_dates <- inner_join(
-          local_together %>%
-            select(geo_value, forecast_date) %>%
-            distinct() %>%
-            arrange(forecast_date) %>%
-            mutate(
-              forecast_date = ceiling_date(forecast_date, unit = "week", week_start = 6)
-            ),
-          external_forecasts %>%
-            select(geo_value, forecast_date) %>%
-            distinct() %>% filter(forecast_date > "2024-10-01"),
-          by = c("geo_value", "forecast_date")
-        )
-        dplyr::bind_rows(
-          local_together %>%
-            mutate(
-              forecast_date = ceiling_date(forecast_date, unit = "week", week_start = 6)
-            ) %>%
-            inner_join(viable_dates, by = c("geo_value", "forecast_date")),
-          external_forecasts %>%
-            inner_join(viable_dates, by = c("geo_value", "forecast_date"))
+        filter_shared_geo_dates(
+          dplyr::bind_rows(!!!.x),
+          external_forecasts
         )
       }
     ),
@@ -504,7 +483,7 @@ if (g_backtest_mode) {
         nhsn_latest_end_of_week <-
           nhsn_latest_data %>%
           mutate(
-            time_value = ceiling_date(time_value, unit = "week", week_start = 6)
+            time_value = round_date(time_value, unit = "week", week_start = 6)
           )
         score_forecasts(nhsn_latest_end_of_week, joined_forecasts_and_ensembles, "flu")
       }
diff --git a/scripts/get_forecast_data.r b/scripts/get_forecast_data.r
index 6606984a..ea7c449d 100644
--- a/scripts/get_forecast_data.r
+++ b/scripts/get_forecast_data.r
@@ -149,6 +149,6 @@ fetch_forecast_files <- function(sync_to_s3 = TRUE, disease) {
 }
 
 cli::cli_alert_info("Fetching COVID forecasts {run_time_local} (UTC: {run_time})")
-fetch_forecast_files(sync_to_s3 = FALSE, disease = "covid")
+covid_forecasts <- fetch_forecast_files(disease = "covid")
 cli::cli_alert_info("Fetching FLU forecasts {run_time_local} (UTC: {run_time})")
-fetch_forecast_files(sync_to_s3 = FALSE, disease = "flu")
+flu_forecasts <- fetch_forecast_files(disease = "flu")
diff --git a/scripts/one_offs/read_covid_forecast_hub_data.jl b/scripts/one_offs/read_covid_forecast_hub_data.jl
index 327bd2a8..eb2c019c 100644
--- a/scripts/one_offs/read_covid_forecast_hub_data.jl
+++ b/scripts/one_offs/read_covid_forecast_hub_data.jl
@@ -1,9 +1,11 @@
 # this was run from within the https://github.com/reichlab/covid19-forecast-hub repo,
-# specifically in the data-processed folder
+# specifically in the model-output folder
+# cd("../../../covid19-forecast-hub/model-output")
+# if started here
 # to get the rds, run
 #
-# full_results <- readr::read_csv("../covid19-forecast-hub/data-processed/covid19-2023season-results.csv")
-# aws.s3::s3save(full_results, object = "covid19_forecast_hub_2023_full_summed.rds", bucket = "forecasting-team-data")
+# full_results <- readr::read_csv(here::here("cache/covid19-2024season-results.csv"))
+# aws.s3::s3save(full_results, object = "covid19_forecast_hub_2024_full.rds", bucket = "forecasting-team-data")
 #
 using Base: floatrange
 using CSV
@@ -13,13 +15,14 @@ using Dates
 using RData
 import Base.lowercase
 pwd()
-res = CSV.read("COVIDhub_CDC-ensemble/2023-10-02-COVIDhub_CDC-ensemble.csv", DataFrame)
-pathname = "COVIDhub_CDC-ensemble/"
-filename = "2023-10-02-COVIDhub_CDC-ensemble.csv"
-state_names = CSV.read("../data-locations/locations.csv", DataFrame)
+res = CSV.read("CovidHub-ensemble/2024-11-23-CovidHub-ensemble.csv", DataFrame)
+pathname = "CovidHub-ensemble"
+filename = "2024-11-23-CovidHub-ensemble.csv"
+state_names = CSV.read("../auxiliary-data/locations.csv", DataFrame)
 lowercase(m::Missing) = m
 @rtransform! state_names @passmissing :abbreviation = lowercase(:abbreviation)
 @select! state_names :abbreviation :location
+format_file(pathname, filename, state_names)
 function format_file(pathname, filename, state_names)
     if length(filename) < 10 ||
        match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", filename[1:10]) == nothing ||
@@ -28,23 +31,29 @@ function format_file(pathname, filename, state_names)
     end
     println(joinpath(pathname, filename))
     res = CSV.read(joinpath(pathname, filename), DataFrame, missingstring="NA", types=Dict("value" => Float64))
-    if !("forecast_date" in names(res)) ||
-       res[!, :forecast_date] |> minimum < Date(2023, 1, 1)
+    if "forecast_date" in names(res)
+        @rename! res :reference_date = :forecast_date
+    end
+    if !("reference_date" in names(res)) ||
+       (res[!, :reference_date] |> minimum) < Date(2023, 1, 1)
         return DataFrame()
     end
-    @transform(res, :target = (:target))
     res = @chain res begin
-        @rtransform :target = parse(Int64, match(r"[0-9]*", :target).match)
+        # old format problem, ahead is now recorded elsewhere
+        #@rtransform :target = parse(Int64, match(r"[0-9]*", :target).match)
         @transform :forecaster = pathname[3:end]
-        @rsubset :type == "quantile"
+        @rsubset :output_type == "quantile"
     end
     res = leftjoin(res, state_names, on=:location)
-    @select! res :forecaster :geo_value = :abbreviation :forecast_date :target_end_date :ahead = :target :quantile :value
-    @chain res begin
-        @rtransform :week_ahead = div(:ahead, 7)
-        @groupby :forecaster :geo_value :forecast_date :week_ahead :quantile
-        @combine :value = sum(:value)
-    end
+    names(res)
+    res[!, :output_type_id]
+    @select res :forecaster :geo_value = :abbreviation :forecast_date = :reference_date :target_end_date :ahead = :horizon :quantile = :output_type_id :value
+    # this is for converting daily forecasts into weekly, whereas this script is currently downloading weekly forecasts
+    #@chain res begin
+    #    @rtransform :week_ahead = div(:ahead, 7)
+    #    @groupby :forecaster :geo_value :reference_date :week_ahead :quantile
+    #    @combine :value = sum(:value)
+    #end
 end
 results = DataFrame[]
 for (root, dirs, files) in walkdir(".")
@@ -52,5 +61,7 @@ for (root, dirs, files) in walkdir(".")
         push!(results, format_file(root, file, state_names))
     end
 end
+maximum(size.(results, 2))
 full_results = vcat(results...)
-CSV.write("covid19-2023season-results.csv", full_results)
+CSV.write("../../exploration-tooling/cache/covid19-2024season-results.csv", full_results)
+pwd()
diff --git a/scripts/reports/season_summary_2025.Rmd b/scripts/reports/season_summary_2025.Rmd
index 42be4b3b..4f34886b 100644
--- a/scripts/reports/season_summary_2025.Rmd
+++ b/scripts/reports/season_summary_2025.Rmd
@@ -24,7 +24,7 @@ body .main-container {
 
 $$\\[.4in]$$
 
-```{r echo=FALSE}
+```{r echo=FALSE, warning=FALSE,message=FALSE}
 knitr::opts_chunk$set(
   fig.align = "center",
   message = FALSE,
@@ -35,30 +35,148 @@ ggplot2::theme_set(ggplot2::theme_bw())
 source(here::here("R/load_all.R"))
 ```
 
-# Season Scoring
-<!-- NOTE: to run this, you must run `make prod-flu` and `make prod-covid` successfully, as it will read the targets directly from the store. -->
-
 ```{r setup, include=FALSE}
 library(DT)
 
 # Define aggregation functions
 Mean <- function(x) mean(x, na.rm = TRUE)
 GeoMean <- function(x, offset = 0) exp(Mean(log(x + offset)))
-flu_scores <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "scores"))
+flu_scores <-
+  qs2::qs_read(here::here("flu_hosp_prod", "objects", "scores")) %>%
+  mutate(forecaster = case_match(
+           forecaster,
+    "windowed_seasonal_extra_sources" ~ "windowed_seasonal_nssp",
+    "ensemble_linclim_windowed_seasonal" ~ "retro_submission",
+    "ens_ar_only" ~ "ensemble_windowed",
+    .default = forecaster
+    ))
+flu_scores %>% distinct(forecaster)
+
+covid_bucket <- s3read_using(
+  nanoparquet::read_parquet,
+  object = glue::glue("exploration/2024-2025_covid_hosp_forecasts.parquet"), bucket = "forecasting-team-data"
+)
+covid_bucket %>% distinct(forecast_date) %>% arrange(desc(forecast_date))
+covid_forecasts %>% distinct(forecast_date) %>% arrange(desc(forecast_date))
+flu_forecasts <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "joined_forecasts_and_ensembles")) %>%
+  mutate(forecaster = case_match(
+           forecaster,
+    "windowed_seasonal_extra_sources" ~ "windowed_seasonal_nssp",
+    "ensemble_linclim_windowed_seasonal" ~ "retro_submission",
+    "ens_ar_only" ~ "ensemble_windowed",
+    .default = forecaster
+    ))
+
 forecast_dates <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "forecast_dates"))
-covid_scores <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "scores"))
+covid_scores <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "scores")) %>%
+  mutate(forecaster = case_match(
+           forecaster,
+    "windowed_seasonal_extra_sources" ~ "windowed_seasonal_nssp",
+    "ensemble_linclim_windowed_seasonal" ~ "retro_submission",
+    "ens_ar_only" ~ "ensemble_windowed",
+    .default = forecaster
+    ))
+covid_scores %>% distinct(forecaster)
+
 ```
+# Models used
+One thing to note: all of these models filter out the 2020/21 and 2021/22 seasons.
+We can split the models and ensembles into 3 categories: the ad-hoc models that we created in response to the actual data that we saw, the AR models that we had been backtesting, and the ensembles.
+
+### The "ad-hoc" models
+
+- `climate_base` uses a 7 week window around the target and forecast date to establish quantiles. 
+  `climate_base` does this separately for each geo
+- `climate_geo_agged` on the other hand converts to rates, pools all geos, computes quantiles using similar time windows, and then converts back to counts.
+  There is effectively only one prediction, scaled to fit each geo. 
+- `linear` does a linear extrapolation of the last 4 weeks of data on a rates scale.
+  Initially it had an intercept, but this was removed when it caused the model to not reproduce the -1 ahead data exactly.
+  This change was made on Jan 8th, in the commit with hash 5f7892b.
+  The quantiles are ad-hoc; the residuals are pooled, symmetrized, truncated using some bounds hand-selected to set the quantiles at a reasonable width, and then propagated forward using `propagate_samples` from epipredict.
+- `climate_linear` combines the `climate_*` models with the `linear` model.
+  It does two linear weightings between the linear model and the climate models.
+  As the ahead goes from -1 to 4, it linearly interpolates between a 5% weight on the climate model and a 90% weight on the climate model (so the furthest ahead is mostly a climate model).
+  At the same time, as the quantile level goes further away from the median, it interpolates between a 10% weight on the climate model at the median and a 100% weight on the climate model at either the 1% or 99% quantile levels.
+  In net, at the median -1 ahead, `climate_linear` has a weight .5%.
+  
+### The AR models
+
+- `windowed_seasonal` is an AR forecaster using lags 0 and 7 that uses training data from an 8 week window from each year.
+  It does quartic root scaling along with quantile and median whitening.
+  For flu, this augments with ili and flusurv (so they are added as additional rows, with their own scaling/centering).
+  Covid doesn't have a comparable dataset.
+  In addition to dropping the first 2 seasons, the windowed models drop the summers for the purposes of determining whitening behavior.
+- `windowed_seasonal_nssp` is like `windowed_seasonal`, but also has `nssp` as an exogenous component.
+  Note that for flu, this effectively means throwing out the ili and flusurv data, since `nssp` is only defined recently.
+  For covid, `windowed_seasonal_nssp` is effectively the same model, but with auxiliary data.
+
+### The general ensembles
+
+- `ensemble_windowed` combines the `windowed_seasonal` and `windowed_seasonal_nssp` in a simple half and half ensemble.
+  One would expect this to be more helpful for Flu than Covid, since they have different information available.
+- `retro_submission` is a retroactive recreation of `CMU-TimeSeries` using updated methods (`linear` always matching the most recent value, for example).
+  The weights for the various models can be found in [`flu_geo_exclusions`](https://github.com/cmu-delphi/exploration-tooling/blob/main/flu_geo_exclusions.csv) or [`covid_geo_exclusions`](https://github.com/cmu-delphi/exploration-tooling/blob/main/covid_geo_exclusions.csv).
+  These can vary on a state by state basis.
+- `CMU-TimeSeries` is what we actually submitted. 
+  This is a moving target that has changed a number of times. For a detailed list of the weights used, see [`flu_geo_exclusions`](https://github.com/cmu-delphi/exploration-tooling/blob/main/flu_geo_exclusions.csv) or [`covid_geo_exclusions`](https://github.com/cmu-delphi/exploration-tooling/blob/main/covid_geo_exclusions.csv) for specific weights.
+  
+  <details>
+  <summary> A timeline of the changes to `CMU-timeseries` </summary>
+  ```{r, echo=FALSE}
+  tribble(~Date, ~`Change for flu`, ~`Change for covid`,
+          as.Date("2024-11-21"), "Initial forecast. Uses a simple average of the `climate_base` and the `linear` models.", "Same model as Flu",
+          as.Date("2024-11-27"), "Start using the `climate_linear` model", "start using the `climate_linear` model",
+          as.Date("2024-12-04"), "-", "-",
+          as.Date("2024-12-11"), "Introduction of `windowed_seasonal` model", "model remains just `climate_linear`",
+          as.Date("2024-12-18"), "-", "-",
+          as.Date("2024-12-25"), "-", "-",
+          as.Date("2025-01-01"), "-", "-",
+          as.Date("2025-01-08"), "`linear` no longer has an intercept", "same change",
+          as.Date("2025-01-15"), "`windowed_seasonal_nssp` introduced to ensemble", "-",
+          as.Date("2025-01-22"), "-", "`windowed_seasonal_nssp` introduced to ensemble",
+          as.Date("2025-01-29"), "-", "-",
+          as.Date("2025-02-05"), "-", "`windowed_seasonal` introduced to ensemble",
+          as.Date("2025-02-12"), "-", "-",
+          as.Date("2025-02-19"), "-", "`windowed_seasonal` removed from ensemble",
+          as.Date("2025-02-26"), "-", "`windowed_seasonal` added to ensemble only for states where `nssp` is missing",
+          as.Date("2025-03-05"), "same from here on", "same from here on",
+          as.Date("2025-03-12"), "-", "-",
+          as.Date("2025-03-19"), "-", "-",
+          as.Date("2025-03-26"), "-", "-",
+          as.Date("2025-04-02"), "-", "-",
+          as.Date("2025-04-09"), "-", "-",
+          as.Date("2025-04-16"), "-", "-",
+          as.Date("2025-04-23"), "-", "-",
+          ) %>%
+    datatable(options = list(pageLength=100))
+  ```
+  </details>
 
-Some overall conclusions here about the scores for the season.
+# Season Scoring
+<!-- NOTE: to run this, you must run `make prod-flu` and `make prod-covid` successfully, as it will read the targets directly from the store. -->
 
-## Flu Scores
+In addition to the plots below, it is worth keeping in mind the all model comparisons from [flu eval dashboard](https://reichlab.io/flusight-dashboard/eval.html) and [covid](https://reichlab.io/covidhub-dashboard/eval.html).
+
+For Flu, the best wis-scoring model there is `PSI-PROF` with a mean WIS of 128.6 vs the ensemble's 140.8 and `CMU-TimeSeries`'s 139.7[^1].
+The best MAE-scoring model there is `CEPH-Rtrend_fluH`, with a mean MAE of 187.4 vs the ensemble's 196.6 and `CMU-TimeSeries`'s 197.8.
+Most models are bad at getting 95% coverage, suggesting most teams have too narrow of extreme quantiles.
+50% coverage is more common, with about a quarter of forecasters being within a 40-60% range (including us).
 
-Some conclusions here about the scores for the season.
+For Covid, there are far fewer models submitted overall.
+The best wis-scoring model is actually just the ensemble at 35.2, with the next-best being `UMass-ar6_pooled` at 37.8, compareed to `CMU-TimeSeries` at 44.8[^2].
+Coverage in covid is somewhat better, though a larger fraction of teams are within +/-10% of 95% coverage; we specifically got within 1%.
+Like with flu, there was systematic under-coverage though, so the models are also biased towards too small of intervals for the 95% band.
+The 50% coverage is likewise more accurate than for flu, with most forecasts within +/-10%.
+`CMU-TimeSeries` is at 52.7%, so slightly over. 
+Generally, more teams were under 50% coverage than over, so there is also a systemic bias towards under-coverage in covid.
 
+## Flu Scores
 
 Before we get into the actual scores, we need to define how we go about creating 3 different phases.
-For the details, see the fold.
 They are `increasing`, `peak`, and `decreasing`.
+Roughly, `peak` is the interval where the value is within 50% of the max and the other two are before and after.
+For the details, see the fold.
+
 <details>
   <summary> Splitting the season </summary>
 ### Splitting the season
@@ -98,10 +216,11 @@ classify_phase <- function(time_value, first_above, last_above, rel_duration, th
     .default = "peak"
   ) %>% factor(levels = c("increasing", "peak", "decreasing", "flat"))
 }
+covid_flat_threshold <- 0.6
 flu_within_max <- compute_peak_season(flu_current)
 sanity_check_classifying <- flu_current %>%
   left_join(flu_within_max, by = "geo_value") %>%
-  mutate(phase = classify_phase(time_value, first_above, last_above, rel_duration, 0.6)) %>%
+  mutate(phase = classify_phase(time_value, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
   group_by(geo_value) %>%
   distinct(phase)
 ```
@@ -120,36 +239,78 @@ flu_current %>%
 There is a wide variety of length for the peak by this definition, but it does seem to naturally reflect the difference in dynamics.
 `ok` is quite short for example, because it has a simple clean peak, whereas `or` has literally 2 peaks with the same height, so the entire interval between them is classified as peak.
 
+Boiling down these plots somewhat, let's look at the averages for the start of the peak and the end of the peak. 
+First, for the start:
+
+```{r}
+covid_within_max %>%
+  filter(rel_duration < covid_flat_threshold) %>%
+  pull(first_above) %>%
+  summary()
+```
+
+So the `increasing` phase ends at earliest on November 2nd, on average on December 21st, and at the latest on January 4th.
+
+```{r}
+covid_within_max %>%
+  filter(rel_duration < covid_flat_threshold) %>%
+  pull(last_above) %>%
+  summary()
+```
+
+Similarly, the `peak` phase ends at the earliest on January the 18th, on average on February 15th, and at the latest on April 5th.
+
 </details>
+
 ### Forecaster Scores for Flu:  {.tabset}
 
 Forecast dates: `r forecast_dates`
 
 #### Scores Aggregated By Forecaster
+`geomean` here uses an offset of the smallest non-zero wis score for that forecaster (accounting for floating point zeros).
+Generally there are far too few to have a major effect (something like 2% of the scores).
+The standard deviation for a given forecaster is significantly larger than the actual mean, so we should avoid drawing too many conclusions from these overall scores.
+
+`mean_pop_norm_wis` and `mean_pop_norm_ae` are on a rate per 100,000.
 
 ```{r, fig.height = 60, fig.width = 12, echo=FALSE}
-flu_scores %>%
+# set the offset to be the minimum non-zero score
+flu_score_summary <- flu_scores %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
   group_by(forecaster) %>%
+  mutate(
+    min_wis = min(wis[wis > 1e-5]),
+    min_ae = min(ae_median[ae_median > 1e-5])
+  ) %>%
   summarize(
     mean_wis = round(Mean(wis), 2),
-    geomean_wis = round(GeoMean(wis), 2),
+    pop_norm_wis = round(Mean(wis *1e5/pop), 2),
+    geo_wis = round(GeoMean(wis, min_wis), 2),
+    #nWISzero = sum(wis < 1e-5),
     mean_ae = round(Mean(ae_median), 2),
-    geomean_ae = round(GeoMean(ae_median), 2),
-    mean_coverage_90 = round(Mean(interval_coverage_90), 2),
+    pop_norm_ae = round(Mean(ae_median*1e5/pop), 2),
+    geo_ae = round(GeoMean(ae_median, min_ae), 2),
+    #nAEzero = sum(ae_median < 1e-5),
+    mean_cov_90 = round(Mean(interval_coverage_90), 2),
     n = n()
   ) %>%
-  rename(id = forecaster) %>%
+  rename(id = forecaster)
+flu_score_summary %>%
   datatable()
 ```
 
 #### Scores Aggregated By Phase
+Note that the standard deviation is frequently double the actual value, much like in the totally general case.
 
 ```{r, fig.height = 8, fig.width = 12, echo=FALSE}
-phase_scores <- flu_scores %>%
+phase_scores <-
+  flu_scores %>%
   left_join(flu_within_max, by = "geo_value") %>%
-  mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, 0.6)) %>%
+  mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
   group_by(forecaster, phase) %>%
   summarize(
+    wis_sd = sd(wis, na.rm = TRUE),
+    ae_sd = sd(ae_median, na.rm = TRUE),
     across(c(wis, ae_median, interval_coverage_90), \(x) round(Mean(x), 2)),
     n = n(),
     .groups = "drop"
@@ -169,11 +330,15 @@ ggplotly(p)
 agg_flu <- flu_scores %>%
   filter(forecast_date > "2024-10-01") %>%
   filter(forecast_date != as.Date("2025-01-25")) %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
   group_by(forecaster, forecast_date) %>%
   summarize(
     mean_wis = round(Mean(wis), 2),
+    mean_pop_norm_wis = round(Mean(wis *1e5/pop), 2),
     geomean_wis = round(GeoMean(wis), 2),
     mean_ae = round(Mean(ae_median), 2),
+    mean_pop_norm_ae = round(Mean(ae_median*1e5/pop), 2),
+    wis_sd = sd(wis),
     geomean_ae = round(GeoMean(ae_median), 2),
     mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
   )
@@ -210,10 +375,60 @@ p <- ggplot(agg, aes(x = ahead, y = mean_wis, color = forecaster)) +
 ggplotly(p)
 ```
 
+#### Score histograms
+
+The standard deviation is far too large to actually include it in any of the previous graphs and tables.
+It is routinely as large as the mean value itself.
+To try to represent this, in this tab we have the histogram of the wis, split by phase and forecaster.
+Color below represents population, with darker blue corresponding to low `geo_value` population, and yellow representing high population (this is viridis).
+Even after normalizing by population, there is a large variation in scale for the scores.
+
+Concentration towards the left corresponds to a better score; for example, `peak` is frequently a flatter distribution, which means most models are doing worse than they were during the `increasing` period.
+`climate_geo_agged` is flatter overall than `ens_ar_only`
+
+```{r, fig.height = 20, fig.width = 13, echo=FALSE}
+#, levels = exp(seq(log(min(pop)), log(max(pop)), length.out = 10))
+flu_scores %>%
+  left_join(flu_within_max, by = "geo_value") %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
+  mutate(wis = wis * 1e5/pop) %>%
+  mutate(pop = factor(pop)) %>%
+  group_by(forecaster) %>%
+  mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
+  ggplot(aes(x = wis, color = pop,  y = ifelse(after_stat(count) > 0, after_stat(count), NA))) +
+  geom_histogram(bins = 70) +
+  facet_grid(forecaster~ phase) +
+  labs(title = "Wis score histogram") +
+  ylab("count") +
+  xlab("wis, population normalized") +
+  theme(plot.title = element_text(hjust = 0.5), legend.position = "none") +
+  scale_color_viridis_d()
+```
+
+### Results
+
+Either by `mean_wis` or `mean_ae`, `CMU-TimeSeries`, `FluSight-ensemble`, and `windowed_seasonal_nssp` all have similarly good performance, with `ens_ar_only`, `retro_submission`, and `windowed_seasonal` all within 10 cases of the best performing.
+On a population normalized basis, `windowed_seasonal_nssp` stands out somewhat, but not in an absolutely stunning way.
+
+Using `mean_cov_90`, most of these models have quantiles that are too narrow; only `climate_geo_agged` actually has a 90% coverage, and it primarily achieves this through having quantiles that are several times larger than the largest value for that geo.
+The best performing models range from 76% to 83%.
+
+Breaking up the scoring by phase, those forecasters all cluster together, with only `FluSight-baseline`, `linear`, and both `climate` only models having appreciably worse scores.
+All of the models do ~twice as worse at the peak as during either the `increasing` or `decreasing` phases, with most models doing marginally better during the `decreasing` phase than the `increasing` phase.
+It is worth noting that phase doesn't correspond to just grouping the dates, because different geographies enter a new phase at different times.
+
+Factoring by ahead, the models that include an AR component generally degrade with ahead less badly. 
+Interestingly, the pure `climate` models having a mostly consistent (and bad) score, but remains much more consistent as aheads increase (after the -1 ahead where it typically has exact data).
+
+
+That said, take a look at the Score histograms tab; all of these forecasters have a *wide* variation in actual score values, with the standard deviation frequently larger than the mean value.
+
+
+
 
 ## Covid Scores
 
-Some conclusions here about the scores for the season.
+Overall, the best covid forecaster is `windowed_seasonal_extra_sources`, which uses a window of data around the given time period
 
 One peculiar thing about Covid scoring: the first day has *much* worse scores than almost any of the subsequent days (you can see this in the Scores Aggregated By Forecast Date tab below).
 This mostly comes from the first week having larger revisions than normal.
@@ -271,12 +486,12 @@ How to handle this?
 One option is to include a separate phase for no season that applies to the entire `geo_value` if more than half of the `time_value`s are within 50% of the peak:
 
 ```{r}
-no_phase <- covid_within_max %>% arrange(desc(rel_duration)) %>% filter(rel_duration > 0.6)
+no_phase <- covid_within_max %>% arrange(desc(rel_duration)) %>% filter(rel_duration > covid_flat_threshold)
 no_phase %>% arrange(rel_duration)
 ```
 
 which is 27 out of our 53 geos.
-0.6 is admittedly a pretty arbitrary cut-off, chosen so that `geo_value`s with high `rel_duration`s which still appear to have a clear peak, such as `de`, `us`, `wy`, and `mi` aren't assigned to `flat`.
+`r covid_flat_threshold` is admittedly a pretty arbitrary cut-off, chosen so that `geo_value`s with high `rel_duration`s which still appear to have a clear peak, such as `de`, `us`, `wy`, and `mi` aren't assigned to `flat`.
 We can probably decrase this filter as we move later into the season and locations which are still decreasing finish dropping.
 As a sanity check, let's plot just these locations to confirm that we're not pulling in geos with actual peaks
 
@@ -290,9 +505,33 @@ covid_current %>%
   theme(legend.position = "none")
 ```
 
+Possible exceptions:
+
+- `pa` unfortunately does seem to have a season, but because it has an early wave, the interval counted as `peak` is too wide. Hopefully as the season actually concludes this will go away.
+- `nc` has a weak peak, but it has only recently declined below the 50% mark. It is likely it will be reclassified after the season is actually over.
+
+There are several locations such as `al` and `ar` which don't have a peak so much as an elevated level for approximately the entire period.
+This is awkward to handle for this classification.
+
+Finally, like for Flu we should examine a summary of the start/end dates for the peak of the season.
+Boiling down these plots somewhat, let's look at the averages for the start of the peak and the end of the peak. 
+First, for the start:
+
+```{r}
+covid_within_max$first_above %>% summary()
+```
+
+So the `increasing` phase ends at earliest on December 28st, on average on January 18th, and at the latest on April 19th.
+Which suggests
 
+```{r}
+covid_within_max$last_above %>% summary()
+```
+
+Similarly, the `peak` phase ends at the earliest on the 11th of December, on average on the first of March, and at the latest on March 22nd.
 
 </details>
+
 ### Forecaster Scores for Covid:  {.tabset}
 
 Forecast dates: `r forecast_dates`
@@ -301,15 +540,23 @@ Forecast dates: `r forecast_dates`
 
 ```{r, fig.height = 60, fig.width = 12, echo=FALSE}
 covid_scores %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
   group_by(forecaster) %>%
+  mutate(
+    min_wis = min(wis[wis > 1e-5]),
+    min_ae = min(ae_median[ae_median > 1e-5])
+  ) %>%
   summarize(
     mean_wis = round(Mean(wis), 2),
-    geomean_wis = round(GeoMean(wis), 2),
+    pop_norm_wis = round(Mean(wis *1e5/pop), 2),
+    geomean_wis = round(GeoMean(wis, min_wis), 2),
     mean_ae = round(Mean(ae_median), 2),
-    geomean_ae = round(GeoMean(ae_median), 2),
+    pop_norm_ae = round(Mean(ae_median*1e5/pop), 2),
+    geomean_ae = round(GeoMean(ae_median, min_ae), 2),
     mean_coverage_90 = round(Mean(interval_coverage_90), 2),
     n = n()
-  ) %>%
+  ) %>% 
+  arrange(mean_wis) %>%
   rename(id = forecaster) %>%
   datatable()
 ```
@@ -319,7 +566,30 @@ covid_scores %>%
 ```{r, fig.height = 8, fig.width = 12, echo=FALSE}
 phase_scores <- covid_scores %>%
   left_join(covid_within_max, by = "geo_value") %>%
-  mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, 0.6)) %>%
+  mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
+  group_by(forecaster, phase) %>%
+  summarize(
+    across(c(wis, ae_median, interval_coverage_90), \(x) round(Mean(x), 2)),
+    n = n(),
+    .groups = "drop"
+  )
+p <- ggplot(phase_scores, aes(x = phase, y = wis, color = forecaster, group = forecaster)) +
+  geom_line() +
+  geom_point() +
+  theme_bw() +
+  labs(x = "Phase", y = "Mean WIS")
+
+ggplotly(p)
+```
+
+#### Scores Aggregated By Phase (no flat)
+
+Since the `flat` classification is somewhat ambiguous, we should also bin everything as we otherwise would.
+
+```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+phase_scores <- covid_scores %>%
+  left_join(covid_within_max, by = "geo_value") %>%
+  mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, 1)) %>%
   group_by(forecaster, phase) %>%
   summarize(
     across(c(wis, ae_median, interval_coverage_90), \(x) round(Mean(x), 2)),
@@ -334,6 +604,7 @@ p <- ggplot(phase_scores, aes(x = phase, y = wis, color = forecaster, group = fo
 
 ggplotly(p)
 ```
+
 #### Scores Aggregated By Forecast Date
 
 ```{r, fig.height = 8, fig.width = 12, echo=FALSE}
@@ -381,12 +652,48 @@ p <- ggplot(agg, aes(x = ahead, y = mean_wis, color = forecaster)) +
 ggplotly(p)
 ```
 
+
+#### Score histograms
+
+The standard deviation is far too large to actually include it in any of the previous graphs and tables meaningfully.
+It is routinely larger than the wis value itself.
+Like with Flu, in this tab we have the histogram of the wis, split by phase and forecaster.
+Color below represents population, with darker blue corresponding to low `geo_value` population, and yellow representing high population (this is viridis).
+Even after normalizing by population, there is a variation in scale for the scores.
+
+Concentration towards the left corresponds to a better score; for example, `peak` is frequently a flatter distribution, which means most models are doing worse than they were during the `increasing` period.
+`climate_geo_agged` is flatter overall than `ens_ar_only`
+
+```{r, fig.height = 20, fig.width = 13, echo=FALSE}
+#, levels = exp(seq(log(min(pop)), log(max(pop)), length.out = 10))
+covid_scores %>%
+  left_join(covid_within_max, by = "geo_value") %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
+  mutate(wis = wis * 1e5/pop) %>%
+  mutate(pop = factor(pop)) %>%
+  group_by(forecaster) %>%
+  mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
+  ggplot(aes(x = wis, color = pop,  y = ifelse(after_stat(count) > 0, after_stat(count), NA))) +
+  geom_histogram(bins = 120) +
+  facet_grid(forecaster~ phase) +
+  labs(title = "Wis score histogram") +
+  ylab("count") +
+  xlab("wis, population normalized") +
+  theme(plot.title = element_text(hjust = 0.5), legend.position = "none") +
+  scale_color_viridis_d()
+```
+
+### Results
+
+
+
 # Revision behavior and data substitution
 
 This is covered in more detail in [revision_summary_report_2025](revision_summary_report_2025.html).
 NHSN has substantial under-reporting behavior that is fairly consistent for any single geo, though there a number of aberrant revisions, some of which change the entire trajectory for a couple of weeks.
-This is even more true for NSSP than NHSN, though the size of the revisions is much smaller, and they occur more quickly, so handling the revisions would be most useful for prediction, rather than for correcting data for fitting the forecaster.
-We can probably improve our forecasts by incorporating revision behavior.
+This is even more true for NSSP than NHSN, though the size of the revisions is much smaller, and they occur more quickly. 
+Because of the speed in revision behavior, it matters only for prediction, rather than for correcting data for fitting the forecaster.
+We can probably improve our forecasts by incorporating revision behavior for both nhsn and nssp.
 
 Further, flu and covid revision behavior is fairly strongly correlated; it is reported through the same channels by the same people, so this makes sense.
 We should look into the extra columns to see if it provides useful information for handling revision behavior.
@@ -469,12 +776,6 @@ flu_gr %>%
 ```
 
 ```{r}
-flu_within_max <- flu_current %>%
-  filter(time_value > "2024-11-01") %>%
-  group_by(geo_value) %>%
-  mutate(max_val = max(value)) %>%
-  filter(value >= max_val/2) %>%
-  summarize(first_above = min(time_value), last_above = max(time_value))
 flu_current %>%
   filter(time_value > "2024-11-01") %>%
   autoplot(value, .facet_by = "geo_value") +
@@ -515,3 +816,10 @@ covid_gr %>%
   filter(time_value > time_value_max) %>%
   arrange(time_value)
 ```
+
+[^1]: this is off from our local version of the scoring by .6, which is nonzero but not appreciably different.
+      It's scored on N=4160 vs the local 3692, which probably comes down to negative aheads.
+      Note that both "bests" in this paragraph are ignoring models which have far fewer submission values, since they're likely to be unrepresentative.
+
+[^2]: this is further off both in absolute and further yet in relative terms from our local scoring, which has `CMU-TimeSeries` at 46.32 rather than 44.8. 
+      It's unclear why; there are 3952 samples scored on the remote vs 3692 locally, so ~300 scored there that we don't score where we apparently did better.

From 80ab82e31333595131e4aa494fb820d3090492ad Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 15 Apr 2025 15:51:57 -0700
Subject: [PATCH 31/62] hotfix: april 9 data tweaks

---
 covid_geo_exclusions.csv | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/covid_geo_exclusions.csv b/covid_geo_exclusions.csv
index 25a849a3..1cc940d0 100644
--- a/covid_geo_exclusions.csv
+++ b/covid_geo_exclusions.csv
@@ -12,6 +12,18 @@ forecast_date,forecaster,geo_value,weight
 2024-10-01, climate_geo_agged, all, 0.5
 2024-10-01, climate_quantile_extrapolated, all, 0
 ##################
+# April 9
+##################
+2025-04-02, all, mp, 0
+2025-04-02, windowed_seasonal, all, 0.0001
+2025-04-02, windowed_seasonal_extra_sources, all, 3
+2025-04-02, climate_linear, all, 0.0001
+2025-04-02, linear, all, 3
+2025-04-02, linearlog, all, 0
+2025-04-02, climate_base, all, 2
+2025-04-02, climate_geo_agged, all, 0.5
+2025-04-02, climate_quantile_extrapolated, all, 0
+##################
 # April 2
 ##################
 2025-04-02, all, mp, 0

From f79c22ba5456e5e97b4498764bde637b655714e0 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 15 Apr 2025 16:01:36 -0700
Subject: [PATCH 32/62] doc: make run.R more correct about env vars

---
 scripts/run.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/run.R b/scripts/run.R
index 956b61c1..44481101 100644
--- a/scripts/run.R
+++ b/scripts/run.R
@@ -40,7 +40,7 @@ covid_submission_directory <- Sys.getenv("COVID_SUBMISSION_DIRECTORY", "cache")
 cli::cli_inform(
   c(
     "i" = "Reading environment variables...",
-    "*" = "TAR_PROJECT = {tar_project}",
+    "*" = "TAR_RUN_PROJECT = {tar_project}",
     "*" = "AWS_S3_PREFIX = {aws_s3_prefix}",
     "*" = "FLU_SUBMISSION_DIRECTORY = {flu_submission_directory}",
     "*" = "COVID_SUBMISSION_DIRECTORY = {covid_submission_directory}"

From 69fd4cc199c1c4e61a6f794ec36338d1bbe7f24e Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Mon, 28 Apr 2025 10:45:41 -0500
Subject: [PATCH 33/62] toc, various minor notes

---
 scripts/reports/season_summary_2025.Rmd | 101 +++++++++++++++---------
 1 file changed, 62 insertions(+), 39 deletions(-)

diff --git a/scripts/reports/season_summary_2025.Rmd b/scripts/reports/season_summary_2025.Rmd
index 4f34886b..82d26f0e 100644
--- a/scripts/reports/season_summary_2025.Rmd
+++ b/scripts/reports/season_summary_2025.Rmd
@@ -4,6 +4,7 @@ date: "compiled on `r format(Sys.time(), '%d %B %Y')`"
 output:
   html_document:
     code_folding: hide
+    toc: True
 editor_options:
   chunk_output_type: console
 ---
@@ -50,23 +51,13 @@ flu_scores <-
     "ens_ar_only" ~ "ensemble_windowed",
     .default = forecaster
     ))
-flu_scores %>% distinct(forecaster)
-
-covid_bucket <- s3read_using(
-  nanoparquet::read_parquet,
-  object = glue::glue("exploration/2024-2025_covid_hosp_forecasts.parquet"), bucket = "forecasting-team-data"
-)
-covid_bucket %>% distinct(forecast_date) %>% arrange(desc(forecast_date))
-covid_forecasts %>% distinct(forecast_date) %>% arrange(desc(forecast_date))
-flu_forecasts <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "joined_forecasts_and_ensembles")) %>%
-  mutate(forecaster = case_match(
-           forecaster,
+flu_forecasts <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "joined_forecasts_and_ensembles"))
+flu_forecasts$forecaster %<>% case_match(
     "windowed_seasonal_extra_sources" ~ "windowed_seasonal_nssp",
     "ensemble_linclim_windowed_seasonal" ~ "retro_submission",
     "ens_ar_only" ~ "ensemble_windowed",
-    .default = forecaster
-    ))
-
+    .default = flu_forecasts$forecaster
+    )
 forecast_dates <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "forecast_dates"))
 covid_scores <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "scores")) %>%
   mutate(forecaster = case_match(
@@ -76,11 +67,11 @@ covid_scores <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "scores"))
     "ens_ar_only" ~ "ensemble_windowed",
     .default = forecaster
     ))
-covid_scores %>% distinct(forecaster)
-
 ```
+
 # Models used
 One thing to note: all of these models filter out the 2020/21 and 2021/22 seasons.
+For both flu and covid they are either unusually large or unusually small, and don't warrant inclusion.
 We can split the models and ensembles into 3 categories: the ad-hoc models that we created in response to the actual data that we saw, the AR models that we had been backtesting, and the ensembles.
 
 ### The "ad-hoc" models
@@ -97,8 +88,26 @@ We can split the models and ensembles into 3 categories: the ad-hoc models that
   It does two linear weightings between the linear model and the climate models.
   As the ahead goes from -1 to 4, it linearly interpolates between a 5% weight on the climate model and a 90% weight on the climate model (so the furthest ahead is mostly a climate model).
   At the same time, as the quantile level goes further away from the median, it interpolates between a 10% weight on the climate model at the median and a 100% weight on the climate model at either the 1% or 99% quantile levels.
-  In net, at the median -1 ahead, `climate_linear` has a weight .5%.
-  
+  In net, at the median -1 ahead, the climate models have a weight of 0.5%, and the linear model of 99.5%.
+
+<details>
+<summary> A plot of the `climate` weights </summary>
+
+```{r climate_weight_plot, fig.width = 12, fig.height = 3}
+weights <-
+  make_ahead_weights(-1:3) %>%
+  left_join(
+    make_quantile_weights(covidhub_probs()),
+    by = c("forecast_family"),
+    relationship = "many-to-many"
+  ) %>%
+  mutate(weight = weight.x * weight.y) %>%
+  select(forecast_family, quantile, ahead, weight)
+weights %>% filter(forecast_family == "climate") %>% ggplot(aes(x = factor(ahead), y = factor(quantile), fill = weight)) + geom_tile() + scale_fill_viridis_c(limits = c(0,1))
+```
+
+</details>
+
 ### The AR models
 
 - `windowed_seasonal` is an AR forecaster using lags 0 and 7 that uses training data from an 8 week window from each year.
@@ -122,7 +131,7 @@ We can split the models and ensembles into 3 categories: the ad-hoc models that
   
   <details>
   <summary> A timeline of the changes to `CMU-timeseries` </summary>
-  ```{r, echo=FALSE}
+  ```{r cmu_timeseries_timeline, echo=FALSE}
   tribble(~Date, ~`Change for flu`, ~`Change for covid`,
           as.Date("2024-11-21"), "Initial forecast. Uses a simple average of the `climate_base` and the `linear` models.", "Same model as Flu",
           as.Date("2024-11-27"), "Start using the `climate_linear` model", "start using the `climate_linear` model",
@@ -191,7 +200,7 @@ There's a great deal of ambiguity in defining the phase however; to keep it simp
 2 is the most ambiguous of these, since sometimes there is a clean peak, and sometimes there are multiple peaks.
 To do this simply, let's see what seasons we get if we use "above 50% of the peak value" to define phase 2.
 
-```{r}
+```{r split_season_functions}
 flu_archive <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "nhsn_archive_data"))
 flu_current <- flu_archive %>%
   epix_as_of_current() %>%
@@ -217,6 +226,7 @@ classify_phase <- function(time_value, first_above, last_above, rel_duration, th
   ) %>% factor(levels = c("increasing", "peak", "decreasing", "flat"))
 }
 covid_flat_threshold <- 0.6
+flu_flat_threshold <- 0.9
 flu_within_max <- compute_peak_season(flu_current)
 sanity_check_classifying <- flu_current %>%
   left_join(flu_within_max, by = "geo_value") %>%
@@ -226,7 +236,7 @@ sanity_check_classifying <- flu_current %>%
 ```
 
 
-```{r, fig.width = 15, fig.height = 15}
+```{r flu_season_definitions, fig.width = 15, fig.height = 15}
 flu_current %>%
   filter(time_value > "2024-11-01") %>%
   autoplot(value, .facet_by = "geo_value") +
@@ -242,18 +252,18 @@ There is a wide variety of length for the peak by this definition, but it does s
 Boiling down these plots somewhat, let's look at the averages for the start of the peak and the end of the peak. 
 First, for the start:
 
-```{r}
-covid_within_max %>%
-  filter(rel_duration < covid_flat_threshold) %>%
+```{r flu_peak_start}
+flu_within_max %>%
+  filter(rel_duration < flu_flat_threshold) %>%
   pull(first_above) %>%
   summary()
 ```
 
 So the `increasing` phase ends at earliest on November 2nd, on average on December 21st, and at the latest on January 4th.
 
-```{r}
-covid_within_max %>%
-  filter(rel_duration < covid_flat_threshold) %>%
+```{r flu_peak_end}
+flu_within_max %>%
+  filter(rel_duration < flu_flat_threshold) %>%
   pull(last_above) %>%
   summary()
 ```
@@ -271,9 +281,9 @@ Forecast dates: `r forecast_dates`
 Generally there are far too few to have a major effect (something like 2% of the scores).
 The standard deviation for a given forecaster is significantly larger than the actual mean, so we should avoid drawing too many conclusions from these overall scores.
 
-`mean_pop_norm_wis` and `mean_pop_norm_ae` are on a rate per 100,000.
+`pop_norm_wis` and `pop_norm_ae` are on a rate per 100,000.
 
-```{r, fig.height = 60, fig.width = 12, echo=FALSE}
+```{r flu_datatable, fig.height = 60, fig.width = 12, echo=FALSE}
 # set the offset to be the minimum non-zero score
 flu_score_summary <- flu_scores %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
@@ -301,8 +311,9 @@ flu_score_summary %>%
 
 #### Scores Aggregated By Phase
 Note that the standard deviation is frequently double the actual value, much like in the totally general case.
+Adding it to the plot here results in bands that wash out all variation between forecasters (which you can see by un-commenting the `geom_ribbon` line).
 
-```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+```{r plot_flu_by_phase, fig.height = 8, fig.width = 12, echo=FALSE}
 phase_scores <-
   flu_scores %>%
   left_join(flu_within_max, by = "geo_value") %>%
@@ -317,16 +328,19 @@ phase_scores <-
   )
 p <- ggplot(phase_scores, aes(x = phase, y = wis, color = forecaster, group = forecaster)) +
   geom_line() +
+  #geom_ribbon(aes(ymin = wis - wis_sd, ymax = wis + wis_sd, fill = forecaster), alpha = 0.3) +
   geom_point() +
   theme_bw() +
-  labs(x = "Phase", y = "Mean WIS") 
+  labs(x = "Phase", y = "Mean WIS")
 
 ggplotly(p)
 ```
 
 #### Scores Aggregated By Forecast Date
+Note that the standard deviation is frequently double the actual value, much like in the totally general case.
+Adding it to the plot here results in bands that wash out all variation between forecasters (which you can see by un-commenting the `geom_ribbon` line).
 
-```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+```{r plot_flu_by_forecast_date, fig.height = 8, fig.width = 12, echo=FALSE}
 agg_flu <- flu_scores %>%
   filter(forecast_date > "2024-10-01") %>%
   filter(forecast_date != as.Date("2025-01-25")) %>%
@@ -346,6 +360,7 @@ agg_flu <- flu_scores %>%
 # Plot the scores as lines across forecast_date
 p <- ggplot(agg_flu, aes(x = forecast_date, y = mean_wis, color = forecaster)) +
   geom_line() +
+  #geom_ribbon(aes(ymin = mean_wis - wis_sd, ymax = mean_wis + wis_sd, fill = forecaster), alpha = 0.1) +
   theme_bw() +
   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
   labs(x = "Forecast Date", y = "Mean WIS")
@@ -355,7 +370,7 @@ ggplotly(p)
 
 #### Scores Aggregated By Ahead
 
-```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+```{r plot_flu_by_ahead, fig.height = 8, fig.width = 12, echo=FALSE}
 agg <- flu_scores %>%
   group_by(forecaster, ahead) %>%
   summarize(
@@ -386,7 +401,7 @@ Even after normalizing by population, there is a large variation in scale for th
 Concentration towards the left corresponds to a better score; for example, `peak` is frequently a flatter distribution, which means most models are doing worse than they were during the `increasing` period.
 `climate_geo_agged` is flatter overall than `ens_ar_only`
 
-```{r, fig.height = 20, fig.width = 13, echo=FALSE}
+```{r flu_score_histogram, fig.height = 20, fig.width = 13, echo=FALSE}
 #, levels = exp(seq(log(min(pop)), log(max(pop)), length.out = 10))
 flu_scores %>%
   left_join(flu_within_max, by = "geo_value") %>%
@@ -538,7 +553,7 @@ Forecast dates: `r forecast_dates`
 
 #### Scores Aggregated By Forecaster
 
-```{r, fig.height = 60, fig.width = 12, echo=FALSE}
+```{r covid_datatable, fig.height = 60, fig.width = 12, echo=FALSE}
 covid_scores %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
   group_by(forecaster) %>%
@@ -563,18 +578,20 @@ covid_scores %>%
 
 #### Scores Aggregated By Phase
 
-```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+```{r plot_covid_by_phase, fig.height = 8, fig.width = 12, echo=FALSE}
 phase_scores <- covid_scores %>%
   left_join(covid_within_max, by = "geo_value") %>%
   mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
   group_by(forecaster, phase) %>%
   summarize(
+    wis_sd = sd(wis, na.rm = TRUE),
     across(c(wis, ae_median, interval_coverage_90), \(x) round(Mean(x), 2)),
     n = n(),
     .groups = "drop"
   )
 p <- ggplot(phase_scores, aes(x = phase, y = wis, color = forecaster, group = forecaster)) +
   geom_line() +
+  #geom_ribbon(aes(ymin = wis - wis_sd, ymax = wis + wis_sd, fill = forecaster), alpha = 0.1) +
   geom_point() +
   theme_bw() +
   labs(x = "Phase", y = "Mean WIS")
@@ -586,18 +603,20 @@ ggplotly(p)
 
 Since the `flat` classification is somewhat ambiguous, we should also bin everything as we otherwise would.
 
-```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+```{r plot_covid_by_phase_no_flat, fig.height = 8, fig.width = 12, echo=FALSE}
 phase_scores <- covid_scores %>%
   left_join(covid_within_max, by = "geo_value") %>%
   mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, 1)) %>%
   group_by(forecaster, phase) %>%
   summarize(
+    wis_sd = sd(wis, na.rm = TRUE),
     across(c(wis, ae_median, interval_coverage_90), \(x) round(Mean(x), 2)),
     n = n(),
     .groups = "drop"
   )
 p <- ggplot(phase_scores, aes(x = phase, y = wis, color = forecaster, group = forecaster)) +
   geom_line() +
+  #geom_ribbon(aes(ymin = wis - wis_sd, ymax = wis + wis_sd, fill = forecaster), alpha = 0.1) +
   geom_point() +
   theme_bw() +
   labs(x = "Phase", y = "Mean WIS")
@@ -608,12 +627,13 @@ ggplotly(p)
 #### Scores Aggregated By Forecast Date
 
 ```{r, fig.height = 8, fig.width = 12, echo=FALSE}
-agg_flu <- covid_scores %>%
+agg_covid <- covid_scores %>%
   filter(forecast_date > "2024-10-01") %>%
   filter(forecast_date != as.Date("2025-01-25")) %>%
   group_by(forecaster, forecast_date) %>%
   summarize(
     mean_wis = round(Mean(wis), 2),
+    wis_sd = round(sd(wis), 2),
     geomean_wis = round(GeoMean(wis), 2),
     mean_ae = round(Mean(ae_median), 2),
     geomean_ae = round(GeoMean(ae_median), 2),
@@ -621,8 +641,9 @@ agg_flu <- covid_scores %>%
   )
 
 # Plot the scores as lines across forecast_date
-p <- ggplot(agg_flu, aes(x = forecast_date, y = mean_wis, color = forecaster)) +
+p <- ggplot(agg_covid, aes(x = forecast_date, y = mean_wis, color = forecaster)) +
   geom_line() +
+  #geom_ribbon(aes(ymin = mean_wis - wis_sd, ymax = mean_wis + wis_sd, fill = forecaster), alpha = 0.1) +
   theme_bw() +
   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
   labs(x = "Forecast Date", y = "Mean WIS")
@@ -637,6 +658,7 @@ agg <- covid_scores %>%
   group_by(forecaster, ahead) %>%
   summarize(
     mean_wis = round(Mean(wis), 2),
+    wis_sd = round(sd(wis), 2),
     geomean_wis = round(GeoMean(wis), 2),
     mean_ae = round(Mean(ae_median), 2),
     geomean_ae = round(GeoMean(ae_median), 2),
@@ -646,6 +668,7 @@ agg <- covid_scores %>%
 # Plot the scores as lines across forecast_date
 p <- ggplot(agg, aes(x = ahead, y = mean_wis, color = forecaster)) +
   geom_line() +
+  #geom_ribbon(aes(ymin = mean_wis - wis_sd, ymax = mean_wis + wis_sd, fill = forecaster), alpha = 0.1) +
   theme_bw() +
   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
   labs(x = "Ahead", y = "Mean WIS")

From cd70e804109fe4a2eba048b446a599b17775b106 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Mon, 28 Apr 2025 17:38:41 -0500
Subject: [PATCH 34/62] including forecasts, more text

---
 scripts/reports/season_summary_2025.Rmd | 188 +++++++++++++++++++++---
 1 file changed, 170 insertions(+), 18 deletions(-)

diff --git a/scripts/reports/season_summary_2025.Rmd b/scripts/reports/season_summary_2025.Rmd
index 82d26f0e..1b87e183 100644
--- a/scripts/reports/season_summary_2025.Rmd
+++ b/scripts/reports/season_summary_2025.Rmd
@@ -37,11 +37,13 @@ source(here::here("R/load_all.R"))
 ```
 
 ```{r setup, include=FALSE}
+library(scales)
 library(DT)
 
 # Define aggregation functions
 Mean <- function(x) mean(x, na.rm = TRUE)
 GeoMean <- function(x, offset = 0) exp(Mean(log(x + offset)))
+our_forecasters <- c("linear", "windowed_seasonal", "windowed_seasonal_nssp", "climate_base", "climate_geo_agged", "climate_linear", "ensemble_windowed", "retro_submission", "CMU-TimeSeries")
 flu_scores <-
   qs2::qs_read(here::here("flu_hosp_prod", "objects", "scores")) %>%
   mutate(forecaster = case_match(
@@ -50,7 +52,8 @@ flu_scores <-
     "ensemble_linclim_windowed_seasonal" ~ "retro_submission",
     "ens_ar_only" ~ "ensemble_windowed",
     .default = forecaster
-    ))
+    )) %>%
+  mutate(our_forecaster = forecaster %in% our_forecasters)
 flu_forecasts <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "joined_forecasts_and_ensembles"))
 flu_forecasts$forecaster %<>% case_match(
     "windowed_seasonal_extra_sources" ~ "windowed_seasonal_nssp",
@@ -67,6 +70,11 @@ covid_scores <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "scores"))
     "ens_ar_only" ~ "ensemble_windowed",
     .default = forecaster
     ))
+
+forecast_week <- flu_scores$forecast_date %>% unique()
+forecast_weeks_to_plot <- c(seq.Date(min(forecast_week), max(forecast_week), by = 3*7), as.Date("2025-01-18"))
+forecast_weeks_to_plot %in% (flu_scores$forecast_date %>% unique())
+forecast_weeks_to_plot %in% (covid_scores$forecast_date %>% unique())
 ```
 
 # Models used
@@ -142,7 +150,7 @@ weights %>% filter(forecast_family == "climate") %>% ggplot(aes(x = factor(ahead
           as.Date("2025-01-01"), "-", "-",
           as.Date("2025-01-08"), "`linear` no longer has an intercept", "same change",
           as.Date("2025-01-15"), "`windowed_seasonal_nssp` introduced to ensemble", "-",
-          as.Date("2025-01-22"), "-", "`windowed_seasonal_nssp` introduced to ensemble",
+          as.Date("2025-01-22"), "data outage- no forecast", "`windowed_seasonal_nssp` introduced to ensemble, also missing data",
           as.Date("2025-01-29"), "-", "-",
           as.Date("2025-02-05"), "-", "`windowed_seasonal` introduced to ensemble",
           as.Date("2025-02-12"), "-", "-",
@@ -165,6 +173,7 @@ weights %>% filter(forecast_family == "climate") %>% ggplot(aes(x = factor(ahead
 <!-- NOTE: to run this, you must run `make prod-flu` and `make prod-covid` successfully, as it will read the targets directly from the store. -->
 
 In addition to the plots below, it is worth keeping in mind the all model comparisons from [flu eval dashboard](https://reichlab.io/flusight-dashboard/eval.html) and [covid](https://reichlab.io/covidhub-dashboard/eval.html).
+We've included the best models there below as well.
 
 For Flu, the best wis-scoring model there is `PSI-PROF` with a mean WIS of 128.6 vs the ensemble's 140.8 and `CMU-TimeSeries`'s 139.7[^1].
 The best MAE-scoring model there is `CEPH-Rtrend_fluH`, with a mean MAE of 187.4 vs the ensemble's 196.6 and `CMU-TimeSeries`'s 197.8.
@@ -284,7 +293,6 @@ The standard deviation for a given forecaster is significantly larger than the a
 `pop_norm_wis` and `pop_norm_ae` are on a rate per 100,000.
 
 ```{r flu_datatable, fig.height = 60, fig.width = 12, echo=FALSE}
-# set the offset to be the minimum non-zero score
 flu_score_summary <- flu_scores %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
   group_by(forecaster) %>%
@@ -301,10 +309,14 @@ flu_score_summary <- flu_scores %>%
     pop_norm_ae = round(Mean(ae_median*1e5/pop), 2),
     geo_ae = round(GeoMean(ae_median, min_ae), 2),
     #nAEzero = sum(ae_median < 1e-5),
+    mean_cov_50 = round(Mean(interval_coverage_50), 2),
     mean_cov_90 = round(Mean(interval_coverage_90), 2),
     n = n()
   ) %>%
-  rename(id = forecaster)
+  rename(id = forecaster) %>%
+  arrange(mean_wis)
+wis_score_order <- flu_score_summary %>% pull(id)
+pop_score_order <- flu_score_summary %>% arrange(pop_norm_wis) %>% pull(id)
 flu_score_summary %>%
   datatable()
 ```
@@ -330,6 +342,7 @@ p <- ggplot(phase_scores, aes(x = phase, y = wis, color = forecaster, group = fo
   geom_line() +
   #geom_ribbon(aes(ymin = wis - wis_sd, ymax = wis + wis_sd, fill = forecaster), alpha = 0.3) +
   geom_point() +
+  scale_y_continuous(breaks = scales::pretty_breaks(n=20), labels = scales::comma) +
   theme_bw() +
   labs(x = "Phase", y = "Mean WIS")
 
@@ -390,7 +403,40 @@ p <- ggplot(agg, aes(x = ahead, y = mean_wis, color = forecaster)) +
 ggplotly(p)
 ```
 
-#### Score histograms
+#### Scores Aggregated By State
+These give population normalized WIS for each state and forecaster.
+Since there seems to be a nonlinear effect of population on the target variable, we
+include color giving population on a log scale.
+
+`pr` is unsurprisingly quite high for most forecasters.
+
+```{r flu_plot_geo_agged, fig.height = 8, fig.width = 12}
+scored_geo <- flu_scores %>%
+  group_by(forecaster, geo_value) %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
+  summarize(
+    mean_wis = round(Mean(wis), 2),
+    pop_norm_wis = round(Mean(wis *1e5/pop), 2),
+    geomean_wis = round(GeoMean(wis), 2),
+    mean_ae = round(Mean(ae_median), 2),
+    geomean_ae = round(GeoMean(ae_median), 2),
+    mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
+  ) %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
+  ungroup()
+pop_score_order <- flu_score_summary %>% arrange(pop_norm_wis) %>% pull(id)
+scored_geo %>%
+  mutate(forecaster = factor(forecaster, levels = pop_score_order)) %>%
+  ggplot(aes(x = geo_value, y = pop_norm_wis, fill = pop)) +
+  geom_col() +
+  facet_wrap(~forecaster) +
+  scale_y_continuous(breaks = scales::pretty_breaks(n=10), labels = scales::comma) +
+  scale_fill_viridis_c(breaks = scales::breaks_log(n=4), labels = scales::label_log(), transform="log") +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.0, hjust = 0.75))
+```
+
+#### Score Histograms
 
 The standard deviation is far too large to actually include it in any of the previous graphs and tables.
 It is routinely as large as the mean value itself.
@@ -420,24 +466,82 @@ flu_scores %>%
   scale_color_viridis_d()
 ```
 
+#### Sample Forecasts
+
+We're plotting the 80% CI along with the median.
+The locations were chosen based on the scores to have a sample of large and small states with large and small (population normalized) WIS.
+We've scaled so everything is in rates per 100k so that it's easier to actually compare; even so the peak value varies drastically.
+Forecasters we've produced are blue, while forecasters from other teams are red.
+They are ordered by `mean_wis` score, best to worst.
+
+```{r flu_plot_sample_forecast, fig.height = 20, fig.width = 13, echo=FALSE}
+plot_geos <- c("ca", "dc", "pa", "hi", "tx")
+filtered_flu_forecasts <- flu_forecasts %>%
+  filter(quantile %in% c(0.1, 0.5, 0.9), geo_value %in% plot_geos)
+
+
+flu_forecasts <- filtered_flu_forecasts %>%
+  filter(forecast_date %in% forecast_weeks_to_plot) %>%
+  mutate(forecaster = factor(forecaster, levels = wis_score_order)) %>%
+  mutate(our_forecaster = forecaster %in% our_forecasters) %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
+  mutate(value = value * 1e5/ pop) %>%
+  pivot_wider(names_from = quantile, values_from = value) %>%
+  ggplot(aes(x = target_end_date)) +
+  geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`, color = our_forecaster, fill = our_forecaster, group = forecast_date), alpha = 0.5) +
+  geom_line(aes(y = `0.5`, color = our_forecaster, group = forecast_date)) +
+  geom_line(
+    data = flu_current %>%
+      filter(time_value > "2024-11-01", geo_value %in% plot_geos) %>%
+      left_join(state_census, by = join_by(geo_value == abbr)) %>%
+      mutate(value = value * 1e5/ pop),
+    aes(x = time_value, y = value)) +
+  scale_color_brewer(palette = "Set2") +
+  scale_fill_brewer(palette = "Set2") +
+  facet_grid(forecaster ~ geo_value, scale = "free") +
+  theme(legend.position = "none")
+flu_forecasts
+
+ggplotly(flu_forecasts)
+```
+
 ### Results
 
-Either by `mean_wis` or `mean_ae`, `CMU-TimeSeries`, `FluSight-ensemble`, and `windowed_seasonal_nssp` all have similarly good performance, with `ens_ar_only`, `retro_submission`, and `windowed_seasonal` all within 10 cases of the best performing.
-On a population normalized basis, `windowed_seasonal_nssp` stands out somewhat, but not in an absolutely stunning way.
+Before digging into the score results, take a look at the Score histograms tab; all of these forecasters have a *wide* variation in actual score values, with the standard deviation frequently larger than the mean value.
+This means the mean scores below are going to be pretty sensitive to the large outliers.
+
+#### Overall scores
 
-Using `mean_cov_90`, most of these models have quantiles that are too narrow; only `climate_geo_agged` actually has a 90% coverage, and it primarily achieves this through having quantiles that are several times larger than the largest value for that geo.
-The best performing models range from 76% to 83%.
+Our best forecasters cluster around a mean wis between 124 and 132, with our actually submitted `CMU-TimeSeries` performing the best.
+The absolute best forecaster this season was `Psi-PROF`, with a `mean_wis` of 113, which is within 10% of `CMU-TimeSeries`.
+While we couldn't have improved our `mean_wis`, we could have improved our population normalized mean wis `pop_norm_wis` by using `windowed_seasonal_nssp` by ~10% as well.
 
-Breaking up the scoring by phase, those forecasters all cluster together, with only `FluSight-baseline`, `linear`, and both `climate` only models having appreciably worse scores.
+Absolute error before and after population normalizing has a similar story, though the order for models from other labs changes, and our relative score improves.
+
+Using `mean_cov_50`, our 50% coverage is generally good, with most models within 10 percentage points of 50%.
+There's quite a bit of variability in other's models, with most models having too narrow of 50% bands.
+
+Using `mean_cov_90`, most of these models have quantiles that are too narrow, though `CMU-TimeSeries` is within 5 percentage points; `climate_geo_agged` has a 90% coverage, but it is otherwise quite inaccurate.
+`retro_submission` does surprisingly well on this metric by hitting 90% exactly.
+
+#### Phase
+
+Breaking up the scoring by phase, most forecasters cluster together pretty tightly, with only `FluSight-baseline`, `linear`, and both `climate` only models having appreciably worse scores.
+`climate_linear` is competitive in the `increasing` phase, but once we hit either the `peak` or `decreasing` it is less accurate; likely this is because this season was both higher and had a longer duration than previous ones.
 All of the models do ~twice as worse at the peak as during either the `increasing` or `decreasing` phases, with most models doing marginally better during the `decreasing` phase than the `increasing` phase.
 It is worth noting that phase doesn't correspond to just grouping the dates, because different geographies enter a new phase at different times.
 
-Factoring by ahead, the models that include an AR component generally degrade with ahead less badly. 
-Interestingly, the pure `climate` models having a mostly consistent (and bad) score, but remains much more consistent as aheads increase (after the -1 ahead where it typically has exact data).
+#### Ahead
 
+Factoring by ahead, the models that include an AR component generally degrade with ahead less badly. 
+Interestingly, the pure `climate` models having a mostly consistent (and bad) score, but remains much more consistent as aheads increase.
+Most of the advantage of `PSI-PROF` and `FluSight-lop_norm` comes from having more accurate 2 and 3 week aheads.
 
-That said, take a look at the Score histograms tab; all of these forecasters have a *wide* variation in actual score values, with the standard deviation frequently larger than the mean value.
+#### Sample forecasts
 
+Looking at a couple of forecasts, it primarily looks like our models were off because they were predicting the downturn far too early.
+Not as badly as our [pure AR forecasters](decreasing_forecasters.html) were however.
+The well performing models from other teams also had this behavior this year.
 
 
 
@@ -554,7 +658,7 @@ Forecast dates: `r forecast_dates`
 #### Scores Aggregated By Forecaster
 
 ```{r covid_datatable, fig.height = 60, fig.width = 12, echo=FALSE}
-covid_scores %>%
+covid_score_summary <- covid_scores %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
   group_by(forecaster) %>%
   mutate(
@@ -571,9 +675,9 @@ covid_scores %>%
     mean_coverage_90 = round(Mean(interval_coverage_90), 2),
     n = n()
   ) %>% 
-  arrange(mean_wis) %>%
-  rename(id = forecaster) %>%
-  datatable()
+  arrange(mean_wis)
+
+datatable(covid_score_summary)
 ```
 
 #### Scores Aggregated By Phase
@@ -676,6 +780,54 @@ ggplotly(p)
 ```
 
 
+#### Scores Aggregated By State
+
+These give population normalized WIS for each state and forecaster.
+Since there seems to be a nonlinear effect of population on the target variable, we
+include color giving population on a log scale.
+They are ordered by their average population normalized WIS.
+We have a separate plot for `climate_geo_agged` because it does so poorly on the small states that it washes out our ability to compare across states and forecasters (note that max is an order of magnitude higher, 2.4 vs 26).
+
+If you want to see a non-population scaled version of this, switch `y = pop_norm_wis` to `y = mean_wis` below, and comment out the `climate_geo_agged` filter.
+
+```{r covid_plot_geo_agged, fig.height = 8, fig.width = 12}
+pop_wis_order <- covid_score_summary %>% arrange(pop_norm_wis) %>% pull(forecaster)
+score_geo <- covid_scores %>%
+  group_by(forecaster, geo_value) %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
+  summarize(
+    mean_wis = round(Mean(wis), 2),
+    pop_norm_wis = round(Mean(wis *1e5/pop), 2),
+    geomean_wis = round(GeoMean(wis), 2),
+    mean_ae = round(Mean(ae_median), 2),
+    geomean_ae = round(GeoMean(ae_median), 2),
+    mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
+  ) %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
+  ungroup() %>%
+  mutate(forecaster = factor(forecaster, levels = pop_wis_order))
+score_geo %>% filter(mean_wis > y_limit) %>% arrange(mean_wis)
+score_geo %>%
+  filter(forecaster != "climate_geo_agged") %>%
+  #mutate(mean_wis = pmin(mean_wis, y_limit)) %>%
+  ggplot(aes(x = geo_value, y = pop_norm_wis, fill = pop)) +
+  geom_col() +
+  facet_wrap(~forecaster) +
+  scale_y_continuous(breaks = scales::pretty_breaks(n=10), labels = scales::comma) +
+  scale_fill_viridis_c(breaks = scales::breaks_log(n=4), labels = scales::label_log(), transform="log") +
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.0, hjust = 0.75))
+```
+
+```{r}
+score_geo %>%
+  filter(forecaster == "climate_geo_agged") %>%
+  ggplot(aes(x = geo_value, y = pop_norm_wis, fill = pop)) +
+  geom_col() +
+  facet_wrap(~forecaster) +
+  scale_y_continuous(breaks = scales::pretty_breaks(n=10), labels = scales::comma) +
+  scale_fill_viridis_c(breaks = scales::breaks_log(n=4), labels = scales::label_log(), transform="log") +
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.0, hjust = 0.75))
+```
 #### Score histograms
 
 The standard deviation is far too large to actually include it in any of the previous graphs and tables meaningfully.
@@ -708,7 +860,7 @@ covid_scores %>%
 
 ### Results
 
-
+Some words on covid scores
 
 # Revision behavior and data substitution
 

From 110c2f52386b15373bafcb3dfd7b53f7911ef0a2 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Mon, 28 Apr 2025 18:07:24 -0500
Subject: [PATCH 35/62] include first_day_wrong, covid forecasts

---
 scripts/reports/first_day_wrong.Rmd     | 108 ++++++++++++++++++++++++
 scripts/reports/season_summary_2025.Rmd |  56 +++++++++++-
 2 files changed, 160 insertions(+), 4 deletions(-)
 create mode 100644 scripts/reports/first_day_wrong.Rmd

diff --git a/scripts/reports/first_day_wrong.Rmd b/scripts/reports/first_day_wrong.Rmd
new file mode 100644
index 00000000..6daec86e
--- /dev/null
+++ b/scripts/reports/first_day_wrong.Rmd
@@ -0,0 +1,108 @@
+---
+title: "Why are the scores so bad on the first day?"
+date: "compiled on `r format(Sys.time(), '%d %B %Y')`"
+output:
+  html_document:
+    code_folding: hide
+editor_options:
+  chunk_output_type: console
+---
+
+```{css, echo=FALSE}
+body {
+  display: block;
+  max-width: 1280px !important;
+  margin-left: auto;
+  margin-right: auto;
+}
+
+body .main-container {
+  max-width: 1280px !important;
+  width: 1280px !important;
+}
+```
+
+$$\\[.4in]$$
+
+```{r echo=FALSE}
+knitr::opts_chunk$set(
+  fig.align = "center",
+  message = FALSE,
+  warning = FALSE,
+  cache = FALSE
+)
+ggplot2::theme_set(ggplot2::theme_bw())
+knitr::opts_template$set(figure1 = list(fig.height = 4, fig.width=4))
+source(here::here("R/load_all.R"))
+```
+
+The scores on the first day are all unusually bad, bad enough we initially thought it was a bug.
+This notebook is a demonstration that these forecasts are somewhat reasonable given the context, and separated out because it would otherwise be distracting.
+The primary reason is that it has an unusual amount of revision.
+First, getting the necessary archive, forecasts, and scores, and plotting the versions around the first forecast day `2024-11-20` (the week of `2024-11-23`):
+
+```{r, fig.height = 15, fig.width = 15}
+covid_scores <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "scores"))
+covid_forecasts <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "joined_forecasts_and_ensembles"))
+covid_archive <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "nhsn_archive_data"))
+text_size <- 6
+covid_archive$DT %>%
+  filter(time_value < as.Date("2024-11-23") + 4*7, time_value > "2024-09-01") %>%
+  as_epi_archive() %>%
+  autoplot(.versions = c(as.Date("2024-11-20"), covid_archive$versions_end)) + 
+  geom_vline(aes(xintercept = as.Date("2024-11-23")))
+```
+
+Most locations have a significantly different version on `2024-11-20`, some by as much as 4 times the final version.
+
+Building a function to plot the different forecasters
+```{r}
+plot_problem_day <- function(forecaster, text_size = 6) {
+  covid_scores %>% filter(forecast_date == "2024-11-23") %>% arrange(wis) 
+covid_forecasts %>% filter(forecaster =="climate_base") %>% filter(forecast_date == "2024-11-23")
+  cmu_timeseries_fc <- covid_forecasts %>% filter(forecaster ==.env$forecaster) %>% filter(forecast_date == "2024-11-23")
+  cmu_timeseries_wide <- cmu_timeseries_fc %>%
+    pivot_wider(names_from = "quantile", values_from = "value")
+  covid_archive$DT %>% filter(time_value < as.Date("2024-11-23") + 4*7, time_value > "2024-09-01") %>% as_epi_archive() %>% autoplot() +
+    geom_vline(aes(xintercept = as.Date("2024-11-23"))) +
+    geom_ribbon(data = cmu_timeseries_wide, aes(x = target_end_date, ymin = `0.1`, ymax = `0.9`), alpha = 0.3) +
+    geom_ribbon(data = cmu_timeseries_wide, aes(x = target_end_date, ymin = `0.25`, ymax = `0.75`), alpha = 0.3) +
+    geom_line(data = cmu_timeseries_wide, aes(x = target_end_date, y = `0.5`)) +
+    facet_wrap(~geo_value, scale = "free")
+}
+```
+
+### Windowed seasonal
+```{r, fig.height = 15, fig.width = 15}
+plot_problem_day("windowed_seasonal")
+```
+
+which is extrapolating out in a straight line from trends that are reporting artifacts.
+
+### Covidhub-baseline
+Covidhub baseline forms a good sanity check, since it is forecasting out from the versioned data (which explains why it is ~as  bad as `windowed_seasonal_extra_sources`)
+
+```{r, fig.height = 15, fig.width = 15}
+plot_problem_day("CovidHub-baseline")
+```
+
+### Linear
+```{r, fig.height = 15, fig.width = 15}
+plot_problem_day("linear")
+```
+
+which is extrapolating out in a straight line from trends that are reporting artifacts.
+
+### Climate
+```{r, fig.height = 15, fig.width = 15}
+plot_problem_day("climate_base")
+```
+
+This one is wrong simply because this season was unusually low at this point.
+
+### CMU-TimeSeries
+This is a bit odd, since the forecaster we were using at the time was a simple average of the linear and climate forecasters, and so is off because of a combination of the reasons the linear and climate forecasts are off.
+
+```{r, fig.height = 15, fig.width = 15}
+plot_problem_day("CMU-TimeSeries")
+```
diff --git a/scripts/reports/season_summary_2025.Rmd b/scripts/reports/season_summary_2025.Rmd
index 1b87e183..e2aa2422 100644
--- a/scripts/reports/season_summary_2025.Rmd
+++ b/scripts/reports/season_summary_2025.Rmd
@@ -70,6 +70,13 @@ covid_scores <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "scores"))
     "ens_ar_only" ~ "ensemble_windowed",
     .default = forecaster
     ))
+covid_forecasts <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "joined_forecasts_and_ensembles")) %>% ungroup()
+covid_forecasts$forecaster %<>% case_match(
+    "windowed_seasonal_extra_sources" ~ "windowed_seasonal_nssp",
+    "ensemble_linclim_windowed_seasonal" ~ "retro_submission",
+    "ens_ar_only" ~ "ensemble_windowed",
+    .default = covid_forecasts$forecaster
+    )
 
 forecast_week <- flu_scores$forecast_date %>% unique()
 forecast_weeks_to_plot <- c(seq.Date(min(forecast_week), max(forecast_week), by = 3*7), as.Date("2025-01-18"))
@@ -480,7 +487,7 @@ filtered_flu_forecasts <- flu_forecasts %>%
   filter(quantile %in% c(0.1, 0.5, 0.9), geo_value %in% plot_geos)
 
 
-flu_forecasts <- filtered_flu_forecasts %>%
+flu_forecast_plt <- filtered_flu_forecasts %>%
   filter(forecast_date %in% forecast_weeks_to_plot) %>%
   mutate(forecaster = factor(forecaster, levels = wis_score_order)) %>%
   mutate(our_forecaster = forecaster %in% our_forecasters) %>%
@@ -500,9 +507,8 @@ flu_forecasts <- filtered_flu_forecasts %>%
   scale_fill_brewer(palette = "Set2") +
   facet_grid(forecaster ~ geo_value, scale = "free") +
   theme(legend.position = "none")
-flu_forecasts
 
-ggplotly(flu_forecasts)
+ggplotly(flu_forecast_plt)
 ```
 
 ### Results
@@ -677,6 +683,8 @@ covid_score_summary <- covid_scores %>%
   ) %>% 
   arrange(mean_wis)
 
+wis_score_order <- covid_score_summary %>% pull(forecaster)
+pop_score_order <- covid_score_summary %>% arrange(pop_norm_wis) %>% pull(forecaster)
 datatable(covid_score_summary)
 ```
 
@@ -807,7 +815,7 @@ score_geo <- covid_scores %>%
   ungroup() %>%
   mutate(forecaster = factor(forecaster, levels = pop_wis_order))
 score_geo %>% filter(mean_wis > y_limit) %>% arrange(mean_wis)
-score_geo %>%
+geo_plot <- score_geo %>%
   filter(forecaster != "climate_geo_agged") %>%
   #mutate(mean_wis = pmin(mean_wis, y_limit)) %>%
   ggplot(aes(x = geo_value, y = pop_norm_wis, fill = pop)) +
@@ -816,6 +824,8 @@ score_geo %>%
   scale_y_continuous(breaks = scales::pretty_breaks(n=10), labels = scales::comma) +
   scale_fill_viridis_c(breaks = scales::breaks_log(n=4), labels = scales::label_log(), transform="log") +
   theme(axis.text.x = element_text(angle = 90, vjust = 0.0, hjust = 0.75))
+
+ggplotly(geo_plot)
 ```
 
 ```{r}
@@ -858,6 +868,44 @@ covid_scores %>%
   scale_color_viridis_d()
 ```
 
+#### Sample Forecasts
+
+We're plotting the 80% CI along with the median.
+The locations were chosen based on the scores to have a sample of large and small states with large and small (population normalized) WIS.
+We've scaled so everything is in rates per 100k so that it's easier to actually compare; even so the peak value varies drastically.
+Forecasters we've produced are blue, while forecasters from other teams are red.
+They are ordered by `mean_wis` score, best to worst.
+
+```{r covid_plot_sample_forecast, fig.height = 20, fig.width = 13, echo=FALSE}
+plot_geos <- c("ca", "dc", "pa", "hi", "tx")
+plot_geos <- c("ca", "de", "pa", "wy")
+filtered_covid_forecasts <- covid_forecasts %>%
+  ungroup() %>%
+  filter(quantile %in% c(0.1, 0.5, 0.9), geo_value %in% plot_geos)
+
+covid_forecast_plt <- filtered_covid_forecasts %>%
+  filter(forecast_date %in% forecast_weeks_to_plot) %>%
+  mutate(forecaster = factor(forecaster, levels = wis_score_order)) %>%
+  mutate(our_forecaster = forecaster %in% our_forecasters) %>%
+  left_join(state_census, by = join_by(geo_value == abbr)) %>%
+  mutate(value = value * 1e5/ pop) %>%
+  pivot_wider(names_from = quantile, values_from = value) %>%
+  ggplot(aes(x = target_end_date)) +
+  geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`, color = our_forecaster, fill = our_forecaster, group = forecast_date), alpha = 0.5) +
+  geom_line(aes(y = `0.5`, color = our_forecaster, group = forecast_date)) +
+  geom_line(
+    data = covid_current %>%
+      filter(time_value > "2024-11-01", geo_value %in% plot_geos) %>%
+      left_join(state_census, by = join_by(geo_value == abbr)) %>%
+      mutate(value = value * 1e5/ pop),
+    aes(x = time_value, y = value)) +
+  scale_color_brewer(palette = "Set2") +
+  scale_fill_brewer(palette = "Set2") +
+  facet_grid(forecaster ~ geo_value, scale = "free") +
+  theme(legend.position = "none")
+
+ggplotly(covid_forecast_plt)
+```
 ### Results
 
 Some words on covid scores

From 49f89216821ef4b5388a30b614330d3fbe7b9a6d Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Tue, 29 Apr 2025 13:32:31 -0500
Subject: [PATCH 36/62] order via factor, covid fcsts, ggplotly

---
 scripts/reports/season_summary_2025.Rmd | 101 +++++++++++++++++++-----
 1 file changed, 83 insertions(+), 18 deletions(-)

diff --git a/scripts/reports/season_summary_2025.Rmd b/scripts/reports/season_summary_2025.Rmd
index e2aa2422..f967858d 100644
--- a/scripts/reports/season_summary_2025.Rmd
+++ b/scripts/reports/season_summary_2025.Rmd
@@ -79,7 +79,7 @@ covid_forecasts$forecaster %<>% case_match(
     )
 
 forecast_week <- flu_scores$forecast_date %>% unique()
-forecast_weeks_to_plot <- c(seq.Date(min(forecast_week), max(forecast_week), by = 3*7), as.Date("2025-01-18"))
+forecast_weeks_to_plot <- c(seq.Date(min(forecast_week), max(forecast_week), by = 3*7), as.Date("2025-01-18"), as.Date("2025-02-01"))
 forecast_weeks_to_plot %in% (flu_scores$forecast_date %>% unique())
 forecast_weeks_to_plot %in% (covid_scores$forecast_date %>% unique())
 ```
@@ -221,6 +221,7 @@ flu_archive <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "nhsn_archive
 flu_current <- flu_archive %>%
   epix_as_of_current() %>%
   filter(geo_value %nin% c("as", "gu", "mp", "vi"))
+flu_max <- flu_current %>% group_by(geo_value) %>% summarize(max_value = max(value))
 compute_peak_season <- function(data_current, threshold = 0.5, start_of_year = as.Date("2024-11-01")) {
   season_length <- data_current %>% pull(time_value) %>% max() - start_of_year
   data_current %>%
@@ -430,29 +431,38 @@ scored_geo <- flu_scores %>%
     mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
   ) %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
+  left_join(flu_max, by = "geo_value") %>%
   ungroup()
 pop_score_order <- flu_score_summary %>% arrange(pop_norm_wis) %>% pull(id)
-scored_geo %>%
+geo_plot <-
+  scored_geo %>%
   mutate(forecaster = factor(forecaster, levels = pop_score_order)) %>%
   ggplot(aes(x = geo_value, y = pop_norm_wis, fill = pop)) +
   geom_col() +
   facet_wrap(~forecaster) +
   scale_y_continuous(breaks = scales::pretty_breaks(n=10), labels = scales::comma) +
-  scale_fill_viridis_c(breaks = scales::breaks_log(n=4), labels = scales::label_log(), transform="log") +
+  scale_fill_viridis_c(transform="log") +
   theme_bw() +
   theme(axis.text.x = element_text(angle = 90, vjust = 0.0, hjust = 0.75))
+
+ggplotly(geo_plot)
 ```
 
 #### Score Histograms
 
 The standard deviation is far too large to actually include it in any of the previous graphs and tables.
-It is routinely as large as the mean value itself.
-To try to represent this, in this tab we have the histogram of the wis, split by phase and forecaster.
+It is routinely larger than the mean WIS.
+To try to represent this, in this tab we have the histogram of the WIS, split by phase and forecaster.
 Color below represents population, with darker blue corresponding to low `geo_value` population, and yellow representing high population (this is viridis).
 Even after normalizing by population, there is a large variation in scale for the scores.
 
+The forecasters are arranged according to mean WIS.
 Concentration towards the left corresponds to a better score; for example, `peak` is frequently a flatter distribution, which means most models are doing worse than they were during the `increasing` period.
-`climate_geo_agged` is flatter overall than `ens_ar_only`
+During the `peak`, very few forecasters actually have any results in the smallest bin; this implies that basically no forecasters were appreciably correct around the peak.
+
+In the `peak` and `decreasing` phases, the linear model simultaneously has a longer tail and a high degree of concentration otherwise, which implies it is both generally right, but catastrophically wrong when it's off.
+
+Comparing the `increasing` and `decreasing` phases across forecasters, `decreasing` tends to have a stronger concentration in the lowest two bins, but a much longer tail of large errors.
 
 ```{r flu_score_histogram, fig.height = 20, fig.width = 13, echo=FALSE}
 #, levels = exp(seq(log(min(pop)), log(max(pop)), length.out = 10))
@@ -461,6 +471,7 @@ flu_scores %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
   mutate(wis = wis * 1e5/pop) %>%
   mutate(pop = factor(pop)) %>%
+  mutate(forecaster = factor(forecaster, levels = wis_score_order)) %>%
   group_by(forecaster) %>%
   mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
   ggplot(aes(x = wis, color = pop,  y = ifelse(after_stat(count) > 0, after_stat(count), NA))) +
@@ -481,6 +492,17 @@ We've scaled so everything is in rates per 100k so that it's easier to actually
 Forecasters we've produced are blue, while forecasters from other teams are red.
 They are ordered by `mean_wis` score, best to worst.
 
+```{r}
+tribble(
+  ~state, ~performance, ~population,
+  "ca", "~best", "large",
+  "dc", "~worst", "small",
+  "pa", "terrible", "large",
+  "hi", "~best", "small",
+  "tx", "good", "large"
+) %>% datatable()
+```
+
 ```{r flu_plot_sample_forecast, fig.height = 20, fig.width = 13, echo=FALSE}
 plot_geos <- c("ca", "dc", "pa", "hi", "tx")
 filtered_flu_forecasts <- flu_forecasts %>%
@@ -490,7 +512,7 @@ filtered_flu_forecasts <- flu_forecasts %>%
 flu_forecast_plt <- filtered_flu_forecasts %>%
   filter(forecast_date %in% forecast_weeks_to_plot) %>%
   mutate(forecaster = factor(forecaster, levels = wis_score_order)) %>%
-  mutate(our_forecaster = forecaster %in% our_forecasters) %>%
+  mutate(our_forecaster = factor(forecaster %in% our_forecasters, levels = c(TRUE, FALSE))) %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
   mutate(value = value * 1e5/ pop) %>%
   pivot_wider(names_from = quantile, values_from = value) %>%
@@ -577,6 +599,7 @@ covid_archive <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "nhsn_arc
 covid_current <- covid_archive %>%
   epix_as_of_current() %>%
   filter(geo_value %nin% c("as", "gu", "mp", "vi"))
+covid_max <- covid_current %>% group_by(geo_value) %>% summarize(max_value = max(value))
 covid_within_max <- compute_peak_season(covid_current)
 ```
 
@@ -678,6 +701,7 @@ covid_score_summary <- covid_scores %>%
     mean_ae = round(Mean(ae_median), 2),
     pop_norm_ae = round(Mean(ae_median*1e5/pop), 2),
     geomean_ae = round(GeoMean(ae_median, min_ae), 2),
+    mean_coverage_50 = round(Mean(interval_coverage_50), 2),
     mean_coverage_90 = round(Mean(interval_coverage_90), 2),
     n = n()
   ) %>% 
@@ -814,7 +838,7 @@ score_geo <- covid_scores %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
   ungroup() %>%
   mutate(forecaster = factor(forecaster, levels = pop_wis_order))
-score_geo %>% filter(mean_wis > y_limit) %>% arrange(mean_wis)
+
 geo_plot <- score_geo %>%
   filter(forecaster != "climate_geo_agged") %>%
   #mutate(mean_wis = pmin(mean_wis, y_limit)) %>%
@@ -822,7 +846,7 @@ geo_plot <- score_geo %>%
   geom_col() +
   facet_wrap(~forecaster) +
   scale_y_continuous(breaks = scales::pretty_breaks(n=10), labels = scales::comma) +
-  scale_fill_viridis_c(breaks = scales::breaks_log(n=4), labels = scales::label_log(), transform="log") +
+  scale_fill_viridis_c(transform="log") +
   theme(axis.text.x = element_text(angle = 90, vjust = 0.0, hjust = 0.75))
 
 ggplotly(geo_plot)
@@ -838,7 +862,8 @@ score_geo %>%
   scale_fill_viridis_c(breaks = scales::breaks_log(n=4), labels = scales::label_log(), transform="log") +
   theme(axis.text.x = element_text(angle = 90, vjust = 0.0, hjust = 0.75))
 ```
-#### Score histograms
+
+#### Score Histograms
 
 The standard deviation is far too large to actually include it in any of the previous graphs and tables meaningfully.
 It is routinely larger than the wis value itself.
@@ -846,16 +871,23 @@ Like with Flu, in this tab we have the histogram of the wis, split by phase and
 Color below represents population, with darker blue corresponding to low `geo_value` population, and yellow representing high population (this is viridis).
 Even after normalizing by population, there is a variation in scale for the scores.
 
+The forecasters are ordered according to mean WIS.
 Concentration towards the left corresponds to a better score; for example, `peak` is frequently a flatter distribution, which means most models are doing worse than they were during the `increasing` period.
-`climate_geo_agged` is flatter overall than `ens_ar_only`
 
-```{r, fig.height = 20, fig.width = 13, echo=FALSE}
-#, levels = exp(seq(log(min(pop)), log(max(pop)), length.out = 10))
+Like in Flu, in the `peak` phase, basically all forecasters are basically missing the first bin, so no forecasters are right during the peak.
+Unlike in Flu, the `flat` phase exists, and roughly resembles `decreasing` in distribution.
+`increasing` is overall a much smaller proportion of all samples.
+
+`climate_base` is the closest any of these scores have come to normally distributed.
+`climate_geo_agged` is particularly bad for Covid.
+
+```{r, fig.height = 23, fig.width = 13}
 covid_scores %>%
   left_join(covid_within_max, by = "geo_value") %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
   mutate(wis = wis * 1e5/pop) %>%
   mutate(pop = factor(pop)) %>%
+  mutate(forecaster = factor(forecaster, levels = wis_score_order)) %>%
   group_by(forecaster) %>%
   mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
   ggplot(aes(x = wis, color = pop,  y = ifelse(after_stat(count) > 0, after_stat(count), NA))) +
@@ -876,9 +908,8 @@ We've scaled so everything is in rates per 100k so that it's easier to actually
 Forecasters we've produced are blue, while forecasters from other teams are red.
 They are ordered by `mean_wis` score, best to worst.
 
-```{r covid_plot_sample_forecast, fig.height = 20, fig.width = 13, echo=FALSE}
-plot_geos <- c("ca", "dc", "pa", "hi", "tx")
-plot_geos <- c("ca", "de", "pa", "wy")
+```{r covid_plot_sample_forecast, fig.height = 23, fig.width = 13, echo=FALSE}
+plot_geos <- c("ca", "de", "pa", "nh", "tx")
 filtered_covid_forecasts <- covid_forecasts %>%
   ungroup() %>%
   filter(quantile %in% c(0.1, 0.5, 0.9), geo_value %in% plot_geos)
@@ -886,7 +917,7 @@ filtered_covid_forecasts <- covid_forecasts %>%
 covid_forecast_plt <- filtered_covid_forecasts %>%
   filter(forecast_date %in% forecast_weeks_to_plot) %>%
   mutate(forecaster = factor(forecaster, levels = wis_score_order)) %>%
-  mutate(our_forecaster = forecaster %in% our_forecasters) %>%
+  mutate(our_forecaster = factor(forecaster %in% our_forecasters, levels = c(TRUE, FALSE))) %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
   mutate(value = value * 1e5/ pop) %>%
   pivot_wider(names_from = quantile, values_from = value) %>%
@@ -908,7 +939,41 @@ ggplotly(covid_forecast_plt)
 ```
 ### Results
 
-Some words on covid scores
+`windowed_seasonal_nssp` is a clear winner regardless of the metric used.
+`ensemble_windowed` is nearly as good, but since it is effectively averaging `windowed_seasonal_nssp` with `windowed_seasonal` and losing accuracy as a result, it is hardly worth it.
+
+The pure climate models were substantially worse for covid than for flu, at ~4.6x the best model, rather than ~2x.
+Given the unusual nature of the season, this is somewhat unsurprising.
+
+To some degree this explains the poor performance of `CMU-TimeSeries`.
+You can see this by looking at the "Scores Aggregated By Forecast Date" tab, where the first 3 weeks of `CMU-TimeSeries` are significantly worse than `climate_linear`, let alone the ensemble or our best models.
+
+#### Aggregated by phase
+
+There are two tabs dedicated to this, one with and one without a separate `flat` phase, which labels an entire state as `flat` if the duration of the `peak` is too long.
+Either way, the general shape is similar to Flu, with `increasing` scores lower than `peak` scores, but higher than `decreasing` scores.
+All of the phases are closer together than they were in the case of Flu, with the best `peak` phase forecaster nearly better than the worst `increasing` phase forecaster.
+`flat` roughly resembles increasing.
+Even disregarding the climate models, the distribution within a phase is wider than it was in the case of Flu.
+`windowed_seasonal_nssp` particularly shines during the `peak` and to some degree the `decreasing` phases.
+
+#### Aggregated by ahead
+
+Nothing terribly surprising here, most models are ~linear in score at increasing ahead.
+`windowed_seasonal_nssp` is the exception, which does comparatively worse at further aheads.
+
+#### Aggregated by State
+
+Across all forecasters, `wy` is a particularly difficult location to forecast, while `ca` is particularly easy.
+Scores don't seem to correlate particularly well with the population of the state.
+The variation in state scores for other group's forecasters is fairly similar to our non-climate forecasters.
+Both climate forecasters have a different distribution of which states are correct and which are wrong, and differ greatly from each-other.
+
+#### Sample Forecasts
+
+The always decreasing problem is definitely not present in these forecasts.
+If anything, our best forecasts are *too* eager to predict an increasing value, e.g. in `tx` and `ca`.
+Several of our worse forecasts are clearly caused by revision behavior.
 
 # Revision behavior and data substitution
 

From c2ef046de47fed2d3ab8e2fafcfba838d5658442 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 29 Apr 2025 14:41:53 -0700
Subject: [PATCH 37/62] doc: season summary lint and covid updates

---
 scripts/reports/season_summary_2025.Rmd | 74 ++++++++++++-------------
 1 file changed, 36 insertions(+), 38 deletions(-)

diff --git a/scripts/reports/season_summary_2025.Rmd b/scripts/reports/season_summary_2025.Rmd
index f967858d..11e15d27 100644
--- a/scripts/reports/season_summary_2025.Rmd
+++ b/scripts/reports/season_summary_2025.Rmd
@@ -86,15 +86,15 @@ forecast_weeks_to_plot %in% (covid_scores$forecast_date %>% unique())
 
 # Models used
 One thing to note: all of these models filter out the 2020/21 and 2021/22 seasons.
-For both flu and covid they are either unusually large or unusually small, and don't warrant inclusion.
+For both flu and covid these seasons are either unusually large or unusually small, and don't warrant inclusion.
 We can split the models and ensembles into 3 categories: the ad-hoc models that we created in response to the actual data that we saw, the AR models that we had been backtesting, and the ensembles.
 
 ### The "ad-hoc" models
 
-- `climate_base` uses a 7 week window around the target and forecast date to establish quantiles. 
-  `climate_base` does this separately for each geo
+- `climate_base` uses a 7 week window around the target and forecast date to establish quantiles.
+  `climate_base` does this separately for each geo.
 - `climate_geo_agged` on the other hand converts to rates, pools all geos, computes quantiles using similar time windows, and then converts back to counts.
-  There is effectively only one prediction, scaled to fit each geo. 
+  There is effectively only one prediction, scaled to fit each geo.
 - `linear` does a linear extrapolation of the last 4 weeks of data on a rates scale.
   Initially it had an intercept, but this was removed when it caused the model to not reproduce the -1 ahead data exactly.
   This change was made on Jan 8th, in the commit with hash 5f7892b.
@@ -127,9 +127,9 @@ weights %>% filter(forecast_family == "climate") %>% ggplot(aes(x = factor(ahead
 
 - `windowed_seasonal` is an AR forecaster using lags 0 and 7 that uses training data from an 8 week window from each year.
   It does quartic root scaling along with quantile and median whitening.
+  In addition to dropping the first 2 seasons, the windowed models drop the summers for the purposes of determining whitening behavior.
   For flu, this augments with ili and flusurv (so they are added as additional rows, with their own scaling/centering).
   Covid doesn't have a comparable dataset.
-  In addition to dropping the first 2 seasons, the windowed models drop the summers for the purposes of determining whitening behavior.
 - `windowed_seasonal_nssp` is like `windowed_seasonal`, but also has `nssp` as an exogenous component.
   Note that for flu, this effectively means throwing out the ili and flusurv data, since `nssp` is only defined recently.
   For covid, `windowed_seasonal_nssp` is effectively the same model, but with auxiliary data.
@@ -141,9 +141,9 @@ weights %>% filter(forecast_family == "climate") %>% ggplot(aes(x = factor(ahead
 - `retro_submission` is a retroactive recreation of `CMU-TimeSeries` using updated methods (`linear` always matching the most recent value, for example).
   The weights for the various models can be found in [`flu_geo_exclusions`](https://github.com/cmu-delphi/exploration-tooling/blob/main/flu_geo_exclusions.csv) or [`covid_geo_exclusions`](https://github.com/cmu-delphi/exploration-tooling/blob/main/covid_geo_exclusions.csv).
   These can vary on a state by state basis.
-- `CMU-TimeSeries` is what we actually submitted. 
+- `CMU-TimeSeries` is what we actually submitted.
   This is a moving target that has changed a number of times. For a detailed list of the weights used, see [`flu_geo_exclusions`](https://github.com/cmu-delphi/exploration-tooling/blob/main/flu_geo_exclusions.csv) or [`covid_geo_exclusions`](https://github.com/cmu-delphi/exploration-tooling/blob/main/covid_geo_exclusions.csv) for specific weights.
-  
+
   <details>
   <summary> A timeline of the changes to `CMU-timeseries` </summary>
   ```{r cmu_timeseries_timeline, echo=FALSE}
@@ -192,7 +192,7 @@ The best wis-scoring model is actually just the ensemble at 35.2, with the next-
 Coverage in covid is somewhat better, though a larger fraction of teams are within +/-10% of 95% coverage; we specifically got within 1%.
 Like with flu, there was systematic under-coverage though, so the models are also biased towards too small of intervals for the 95% band.
 The 50% coverage is likewise more accurate than for flu, with most forecasts within +/-10%.
-`CMU-TimeSeries` is at 52.7%, so slightly over. 
+`CMU-TimeSeries` is at 52.7%, so slightly over.
 Generally, more teams were under 50% coverage than over, so there is also a systemic bias towards under-coverage in covid.
 
 ## Flu Scores
@@ -266,7 +266,7 @@ flu_current %>%
 There is a wide variety of length for the peak by this definition, but it does seem to naturally reflect the difference in dynamics.
 `ok` is quite short for example, because it has a simple clean peak, whereas `or` has literally 2 peaks with the same height, so the entire interval between them is classified as peak.
 
-Boiling down these plots somewhat, let's look at the averages for the start of the peak and the end of the peak. 
+Boiling down these plots somewhat, let's look at the averages for the start of the peak and the end of the peak.
 First, for the start:
 
 ```{r flu_peak_start}
@@ -561,9 +561,8 @@ It is worth noting that phase doesn't correspond to just grouping the dates, bec
 
 #### Ahead
 
-Factoring by ahead, the models that include an AR component generally degrade with ahead less badly. 
-Interestingly, the pure `climate` models having a mostly consistent (and bad) score, but remains much more consistent as aheads increase.
-Most of the advantage of `PSI-PROF` and `FluSight-lop_norm` comes from having more accurate 2 and 3 week aheads.
+Factoring by ahead, the models that include an AR component generally degrade with ahead less badly.
+Interestingly, the pure `climate` models having a mostly consistent (and bad) score, but remains much more consistent as aheads increase (after the -1 ahead where it typically has exact data).
 
 #### Sample forecasts
 
@@ -575,13 +574,6 @@ The well performing models from other teams also had this behavior this year.
 
 ## Covid Scores
 
-Overall, the best covid forecaster is `windowed_seasonal_extra_sources`, which uses a window of data around the given time period
-
-One peculiar thing about Covid scoring: the first day has *much* worse scores than almost any of the subsequent days (you can see this in the Scores Aggregated By Forecast Date tab below).
-This mostly comes from the first week having larger revisions than normal.
-This is discussed in more detail in [this notebook](first_day_wrong.html).
-
-
 Before we get into the actual scores, we need to define how we go about creating 4 different phases.
 They are `increasing`, `peak`, `decreasing`, and `flat`.
 The last phase, `flat`, covers geos which didn't have an appreciable season for the year, which was relatively common for covid.
@@ -630,7 +622,7 @@ covid_current %>%
 Then we can see a very muted season in many locations, such as `ar` or `co`, and no season at all in some locations, such as `ak`.
 Others, such as `az`, `in`, or `mn` have a season that is on-par with historical ones.
 
-How to handle this? 
+How to handle this?
 One option is to include a separate phase for no season that applies to the entire `geo_value` if more than half of the `time_value`s are within 50% of the peak:
 
 ```{r}
@@ -661,23 +653,20 @@ Possible exceptions:
 There are several locations such as `al` and `ar` which don't have a peak so much as an elevated level for approximately the entire period.
 This is awkward to handle for this classification.
 
-Finally, like for Flu we should examine a summary of the start/end dates for the peak of the season.
-Boiling down these plots somewhat, let's look at the averages for the start of the peak and the end of the peak. 
-First, for the start:
+Finally, like for Flu, we should examine a summary of the start/end dates for the peak of the covid season.
+Boiling down these plots somewhat, let's look at the averages for the start of the peak and the end of the peak.
+First, for the start of start of the peak:
 
 ```{r}
 covid_within_max$first_above %>% summary()
 ```
 
-So the `increasing` phase ends at earliest on December 28st, on average on January 18th, and at the latest on April 19th.
-Which suggests
+Second, for the end of the peak:
 
 ```{r}
 covid_within_max$last_above %>% summary()
 ```
 
-Similarly, the `peak` phase ends at the earliest on the 11th of December, on average on the first of March, and at the latest on March 22nd.
-
 </details>
 
 ### Forecaster Scores for Covid:  {.tabset}
@@ -704,12 +693,10 @@ covid_score_summary <- covid_scores %>%
     mean_coverage_50 = round(Mean(interval_coverage_50), 2),
     mean_coverage_90 = round(Mean(interval_coverage_90), 2),
     n = n()
-  ) %>% 
-  arrange(mean_wis)
-
-wis_score_order <- covid_score_summary %>% pull(forecaster)
-pop_score_order <- covid_score_summary %>% arrange(pop_norm_wis) %>% pull(forecaster)
-datatable(covid_score_summary)
+  ) %>%
+  arrange(mean_wis) %>%
+  rename(id = forecaster) %>%
+  datatable()
 ```
 
 #### Scores Aggregated By Phase
@@ -937,10 +924,20 @@ covid_forecast_plt <- filtered_covid_forecasts %>%
 
 ggplotly(covid_forecast_plt)
 ```
+
 ### Results
 
-`windowed_seasonal_nssp` is a clear winner regardless of the metric used.
-`ensemble_windowed` is nearly as good, but since it is effectively averaging `windowed_seasonal_nssp` with `windowed_seasonal` and losing accuracy as a result, it is hardly worth it.
+One peculiar thing about Covid scoring: on the first forecast date, CMU-TimeSeries has *much* worse scores than almost any of the subsequent days (you can see this in the Scores Aggregated By Forecast Date tab below).
+There are two related issues here:
+- first, our initial model combined climate_base and linear, and the climate_base component was unusually bad early in the season, because this season started later than previous seasons,
+- second, the data had substantial revisions (this is discussed in detail in [this notebook](first_day_wrong.html)), however this effect is much smaller, since other forecasters had access to the same data.
+
+This mishap dragged the CMU-TimeSeries score down overall by quite a lot and its better performance later in the season is not enough to make up for it.
+
+Overall, the best covid forecaster is `windowed_seasonal_nssp`, outperforming `CovidHub-ensemble`, regardless of the metric used.
+This forecaster uses a window of data around the given time period, along with the NSSP exogenous features.
+`ensemble_windowed` is nearly as good, but since it is effectively averaging `windowed_seasonal_nssp` with `windowed_seasonal` and losing accuracy as a result, so it is hardly worth it.
+Given its simplicity, the `climate_linear` forecaster does quite well, though it's not as good as `windowed_seasonal_nssp`.
 
 The pure climate models were substantially worse for covid than for flu, at ~4.6x the best model, rather than ~2x.
 Given the unusual nature of the season, this is somewhat unsurprising.
@@ -975,11 +972,12 @@ The always decreasing problem is definitely not present in these forecasts.
 If anything, our best forecasts are *too* eager to predict an increasing value, e.g. in `tx` and `ca`.
 Several of our worse forecasts are clearly caused by revision behavior.
 
+
 # Revision behavior and data substitution
 
-This is covered in more detail in [revision_summary_report_2025](revision_summary_report_2025.html).
+This is covered in more detail in [revision_summary_report_2025](revision_summary_2025.html).
 NHSN has substantial under-reporting behavior that is fairly consistent for any single geo, though there a number of aberrant revisions, some of which change the entire trajectory for a couple of weeks.
-This is even more true for NSSP than NHSN, though the size of the revisions is much smaller, and they occur more quickly. 
+This is even more true for NSSP than NHSN, though the size of the revisions is much smaller, and they occur more quickly.
 Because of the speed in revision behavior, it matters only for prediction, rather than for correcting data for fitting the forecaster.
 We can probably improve our forecasts by incorporating revision behavior for both nhsn and nssp.
 
@@ -1109,5 +1107,5 @@ covid_gr %>%
       It's scored on N=4160 vs the local 3692, which probably comes down to negative aheads.
       Note that both "bests" in this paragraph are ignoring models which have far fewer submission values, since they're likely to be unrepresentative.
 
-[^2]: this is further off both in absolute and further yet in relative terms from our local scoring, which has `CMU-TimeSeries` at 46.32 rather than 44.8. 
+[^2]: this is further off both in absolute and further yet in relative terms from our local scoring, which has `CMU-TimeSeries` at 46.32 rather than 44.8.
       It's unclear why; there are 3952 samples scored on the remote vs 3692 locally, so ~300 scored there that we don't score where we apparently did better.

From 48c3b698d37b373fbc067c29f16d18c61f440083 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 29 Apr 2025 14:42:04 -0700
Subject: [PATCH 38/62] doc: first day wrong lints

---
 scripts/reports/first_day_wrong.Rmd | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/scripts/reports/first_day_wrong.Rmd b/scripts/reports/first_day_wrong.Rmd
index 6daec86e..6a5fa417 100644
--- a/scripts/reports/first_day_wrong.Rmd
+++ b/scripts/reports/first_day_wrong.Rmd
@@ -1,5 +1,5 @@
 ---
-title: "Why are the scores so bad on the first day?"
+title: "Why are the COVID 2024-2025 season scores so bad on the first day?"
 date: "compiled on `r format(Sys.time(), '%d %B %Y')`"
 output:
   html_document:
@@ -36,9 +36,13 @@ knitr::opts_template$set(figure1 = list(fig.height = 4, fig.width=4))
 source(here::here("R/load_all.R"))
 ```
 
-The scores on the first day are all unusually bad, bad enough we initially thought it was a bug.
+The scores on the first forecast day for COVID are all unusually bad, bad enough we initially thought it was a bug.
 This notebook is a demonstration that these forecasts are somewhat reasonable given the context, and separated out because it would otherwise be distracting.
 The primary reason is that it has an unusual amount of revision.
+In comparison, the flu season delayed their initial forecast by a day, which allowed a new data revision to be used, which explains why flu doesn't have this problem.
+
+# Revision Behavior
+
 First, getting the necessary archive, forecasts, and scores, and plotting the versions around the first forecast day `2024-11-20` (the week of `2024-11-23`):
 
 ```{r, fig.height = 15, fig.width = 15}
@@ -49,16 +53,16 @@ text_size <- 6
 covid_archive$DT %>%
   filter(time_value < as.Date("2024-11-23") + 4*7, time_value > "2024-09-01") %>%
   as_epi_archive() %>%
-  autoplot(.versions = c(as.Date("2024-11-20"), covid_archive$versions_end)) + 
+  autoplot(.versions = c(as.Date("2024-11-20"), covid_archive$versions_end)) +
   geom_vline(aes(xintercept = as.Date("2024-11-23")))
 ```
 
 Most locations have a significantly different version on `2024-11-20`, some by as much as 4 times the final version.
 
-Building a function to plot the different forecasters
 ```{r}
+# Building a function to plot the different forecasters
 plot_problem_day <- function(forecaster, text_size = 6) {
-  covid_scores %>% filter(forecast_date == "2024-11-23") %>% arrange(wis) 
+  covid_scores %>% filter(forecast_date == "2024-11-23") %>% arrange(wis)
 covid_forecasts %>% filter(forecaster =="climate_base") %>% filter(forecast_date == "2024-11-23")
   cmu_timeseries_fc <- covid_forecasts %>% filter(forecaster ==.env$forecaster) %>% filter(forecast_date == "2024-11-23")
   cmu_timeseries_wide <- cmu_timeseries_fc %>%

From def1448e9a448e5010a724ba8e5e21ecc4cf9d78 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 29 Apr 2025 15:19:52 -0700
Subject: [PATCH 39/62] doc: big update to template.md, describe our forecaster
 families

---
 R/utils.R           |  32 +------
 reports/template.md | 226 +++++++++++++++++++++++---------------------
 2 files changed, 119 insertions(+), 139 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index 5c8b0b13..a73c3690 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -392,39 +392,9 @@ update_site <- function(sync_to_s3 = TRUE) {
     )
 
     # Insert into Production Reports section, skipping a line
-    prod_reports_index <- which(grepl("## Production Reports", report_md_content)) + 1
+    prod_reports_index <- which(grepl("## Weekly Fanplots 2024-2025 Season", report_md_content)) + 1
     report_md_content <- append(report_md_content, report_link, after = prod_reports_index)
   }
-  # add scoring notebooks if they exist
-  score_files <- dir_ls(reports_dir, regexp = ".*_backtesting_2024_2025_on_.*.html")
-  if (length(score_files) > 0) {
-    # a tibble of all score files, along with their generation date and disease
-    score_table <- tibble(
-      filename = score_files,
-      dates = str_match_all(filename, "[0-9]{4}-..-..")
-    ) %>%
-      unnest_wider(dates, names_sep = "_") %>%
-      rename(generation_date = dates_1) %>%
-      mutate(
-        generation_date = ymd(generation_date),
-        disease = str_match(filename, "flu|covid")
-      )
-    used_files <- score_table %>%
-      group_by(disease) %>%
-      slice_max(generation_date)
-    # iterating over the diseases
-    for (row_num in seq_along(used_files$filename)) {
-      file_name <- path_file(used_files$filename[[row_num]])
-      scoring_index <- which(grepl("### Scoring this season", report_md_content)) + 1
-      score_link <- sprintf(
-        "- [%s Scoring, Rendered %s](%s)",
-        str_to_title(used_files$disease[[row_num]]),
-        used_files$generation_date[[row_num]],
-        file_name
-      )
-      report_md_content <- append(report_md_content, score_link, after = scoring_index)
-    }
-  }
 
   # Write the updated content to report.md
   report_md_path <- path(reports_dir, "report.md")
diff --git a/reports/template.md b/reports/template.md
index 074fdaa3..967eb55e 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -2,154 +2,164 @@
 
 [GitHub Repo](https://github.com/cmu-delphi/explorationt-tooling/)
 
-## Production Reports
+## Overview
 
+- The weekly fanplots were used by the team for visual inspections of the forecasts.
+- The season reports provide a general analysis of the season's data and forecasts performance.
+- The backtesting reports provide a detailed analysis of a wide variety of forecasters' performance on the previous season's data.
+- A description of the forecaster families explored is provided at the bottom of the page.
 
-### Scoring this season
+## Weekly Fanplots 2024-2025 Season
 
 
-## Summary Reports
+## 2024-2025 Season Reports
 
-### 2025
-
-- [Season Summary](season_summary_2025.html) The other documents are also linked from here
+- [Season Summary](season_summary_2025.html) (the notebooks below are linked from here)
+  - [Covid's Problematic Initial Forecast](first_day_wrong.html)
+  - [NHSN Revision Behavior](revision_summary_2025.html)
 - [An Analysis of Decreasing Behavior in Forecasters](decreasing_forecasters.html)
-- [Revision Behavior](revision_summary_report_2025.html)
-- [Covid's problematic initial forecast](first_day_wrong.html)
 - [NHSN 2024-2025 Data Analysis](new_data.html)
 
-### Flu
-
-All forecasters population scale their data, use geo pooling, and train using quantreg.
-These definitions are in the `flu_forecaster_config.R` file.
-
-- [Flu Overall](flu-overall-notebook.html)
-- [Flu AR](flu-notebook-scaled_pop_main.html)
-- [Flu AR with augmented data](flu-notebook-scaled_pop_data_augmented.html)
-- [Flu AR with exogenous features](flu-notebook-scaled_pop_exogenous.html)
-- [Flu AR with different seasonal schemes](flu-notebook-scaled_pop_season.html)
-- [Flu AR with augmented data and with different seasonal window sizes](flu-notebook-season_window_sizes.html)
-- [Flu AR with augmented data, exogenous features, and seasonal windowing](flu-notebook-scaled_pop_season_exogenous.html)
-
-Simplistic/low data methods:
-
-- [Flu no recent](flu-notebook-no_recent_quant.html)
-- [Flu flatline](flu-notebook-flatline.html)
-- [Flu climate](flu-notebook-climate_linear.html)
-
-### Covid
-
-All forecasters population scale their data, use geo pooling, and train using quantreg.
-These definitions are in the `covid_forecaster_config.R` file.
-
-- [Covid AR](covid-notebook-scaled_pop_main.html)
-- [Covid AR with seasonal features](covid-notebook-scaled_pop_season.html)
-- [Covid AR with exogenous features](covid-notebook-scaled_pop_exogenous.html)
-- [Covid Flatline](covid-notebook-flatline_forecaster.html)
-
-Simplistic/low data methods:
-
-- [Covid no recent](covid-notebook-no_recent_quant.html)
-- [Covid flatline](covid-notebook-flatline.html)
-- [Covid climate](covid-notebook-climate_linear.html)
+## Backtesting on 2023-2024 Season
+
+- [Exploration Summary](exploration_summary_2024.html)
+- Flu
+  - All forecasters population scale their data, use geo pooling, and train using quantreg.
+  - These definitions are in the `flu_forecaster_config.R` file.
+  - [Flu Overall](flu-overall-notebook.html)
+  - [Flu AR](flu-notebook-scaled_pop_main.html)
+  - [Flu AR with augmented data](flu-notebook-scaled_pop_data_augmented.html)
+  - [Flu AR with exogenous features](flu-notebook-scaled_pop_exogenous.html)
+  - [Flu AR with different seasonal schemes](flu-notebook-scaled_pop_season.html)
+  - [Flu AR with augmented data and with different seasonal window sizes](flu-notebook-season_window_sizes.html)
+  - [Flu AR with augmented data, exogenous features, and seasonal windowing](flu-notebook-scaled_pop_season_exogenous.html)
+  - Simplistic/low data methods:
+    - [Flu no recent](flu-notebook-no_recent_quant.html)
+    - [Flu no recent](flu-notebook-no_recent_quant.html)
+    - [Flu flatline](flu-notebook-flatline.html)
+    - [Flu climate](flu-notebook-climate_linear.html)
+- Covid
+  - All forecasters population scale their data, use geo pooling, and train using quantreg.
+  - These definitions are in the `covid_forecaster_config.R` file.
+  - [Covid AR](covid-notebook-scaled_pop_main.html)
+  - [Covid AR with seasonal features](covid-notebook-scaled_pop_season.html)
+  - [Covid AR with exogenous features](covid-notebook-scaled_pop_exogenous.html)
+  - [Covid Flatline](covid-notebook-flatline_forecaster.html)
+  - Simplistic/low data methods:
+    - [Covid no recent](covid-notebook-no_recent_quant.html)
+    - [Covid flatline](covid-notebook-flatline.html)
+    - [Covid climate](covid-notebook-climate_linear.html)
+
+## Description of Forecaster Families
+
+The main forecaster families were:
+- Autoregressive models (AR)
+  - with seasonal features
+  - with exogenous features
+  - with augmented data
+- Climatological
+- Linear trend
+- No recent outcome
+- Flatline
+
+All the AR models had the option of population scaling, seasonal features, exogenous features, and augmented data.
+We tried all possible combinations of these features.
+All models had the option of using the `linreg`, `quantreg`, or `grf` engine.
+We found that `quantreg` gave better results than `linreg` and we had computational issues with `grf`, so we used `quantreg` the rest of the time.
+
+### Autoregressive models (AR)
 
-## Descriptions of Forecaster Families
+Internal name: `scaled_pop`.
 
-### Training Data Information
+A simple autoregressive model, which predicts using
 
-(Taken from [David's Org File](https://github.com/cmu-delphi/exploration-tooling/blob/5a6da8d0d0202da6d79a5ee8e702d4654364ce46/forecasters_description.org#flusion).)
+$$x_{t+k} = ar(x)$$
 
-Some use just NHSN, while others use historical data from ILI+ and Flusurv+ as
-additional rows in training. ILI+ and Flusurv+ have been adjusted so that the
-total for the season matches NHSN’s total. Flusurv is taken from epidata, but
-ILI+ was constructed by Evan and given to Richard. The testing date range is
-roughly the 2023 season, so October 2023 through late April 2024.
+where $x$ is the target variable and $ar(x)$ is a linear combination of the target variable's past values, which can be scaled according to each state's population or whitened according to another scheme (or both). In practice, we found that using lags (0, 7) was quite effective (with (0, 7, 14) and (0, 7, 14, 21) providing no discernible advantage), so we focused on those, so in practice our model was
 
-### Flu exogenous features
+$$x_{t+k} = x_t + x_{t-7}$$
 
-- NSSP
-  Note that this data set is possibly cheating, as we don't have revisions before April of this year, so it is using the latest data.
-  If we narrow down to `time_value`s after that, the revision behavior is
+where $k \in \{0, 7, 14, 21, 28\}$ is the forecast horizon.
 
-  ```
-  Min lag (time to first version):
-      min median     mean     max
-  7 days 7 days 7.7 days 14 days
-  Fraction of epi_key+time_values with
-  No revisions:
-  • 362 out of 954 (37.95%)
-  Quick revisions (last revision within 3 days of the `time_value`):
-  • 0 out of 954 (0%)
-  Few revisions (At most 3 revisions for that `time_value`):
-  • 946 out of 954 (99.16%)
+### Autoregressive models with seasonal features
 
-  Fraction of revised epi_key+time_values which have:
-  Less than 0.1 spread in relative value:
-  • 329 out of 592 (55.57%)
-  Spread of more than 0.1015 in actual value (when revised):
-  • 18 out of 592 (3.04%)
-  days until within 20% of the latest value:
-      min median   mean     max
-  7 days 7 days 9 days 70 days
-  ```
+Internal name: `scaled_pop_seasonal`.
 
-  So most days have some revisioning, but with fairly small total changes, with the vast majority of days being within 20% of their eventual value within a week (with some much longer exceptions, apparently).
-  So the impact of the cheating is likely small but of course hard to know.
+We tried a few different attempts at incorporating seasonal features:
 
-- Google-Symptoms
-  This dataset doesn't have revisions, but has a history of suddenly disappearing.
-  The latest value was used to simulate actually having the data; at worst, it breaks down to being the underlying forecaster.
-- NWSS and NWSS_regional
-  The originating dataset has minimal revisions, but as this is a dataset with quite a lot of processing from the underlying that involves some amount of time travel, it is unclear how much revision behavior it effectively has.
+- The approach that performed the best was using a training window that grabbed a window of data (about 4 weeks before and ahead) around the forecast epiweek from the current and previous seasons.
+- Two indicator variables that roughly correspond to before, during, and after the typical peak (roughly, `before = season_week < 16`, `during = 16 <= season_week <= 20`, and `after = season_week > 20`).
+- Taking the first two principal components of the full whitened augmented data reshaped as `(epiweek, state_source_season_value)`.
+(We found that this was not particularly effective, so we did not use it.
+Despite spending a week debugging this, we could not rule out the possibility that it was a bug.
+However, we also had mixed results from tests of this feature in very simple synthetic data cases.)
+- We also tried using the climatological median of the target variable as a feature (see below for definition of "climatological").
+- Note that unusually, the last two features are actually led rather than lagged, since we should be predicting using the target's coefficient, rather than the present one.
 
-### Data Whitening
+### Autoregressive models with seasonal and exogenous features
 
-The data augmented models using ILI+ and FluSurv+ take a few different approaches to data whitening, depending on the `scale_method, center_method, nonlin_method` parameters.
+Internal name: `scaled_pop_seasonal` (with `filter_source = "nhsn"`).
 
-TODO: Add descriptions.
+These models could opt into the same seasonal features as the `scaled_pop_seasonal` forecaster, but also included exogenous features.
 
-This is more closely in line with the [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler) from scikit-learn (using a much wider quantile than the default settings there).
+#### Flu exogenous features
 
-## Forecaster Families
+- NSSP - we don't have revisions before Spring 2024 for this data, so we used a revision analysis from the data collected after that date to estimate the lag (roughly 7 days) and used that lag to simulate delays.
+- Google-Symptoms - this dataset doesn't have revisions, but has a history of suddenly disappearing, resulting in intermittent long update lags.
+We did not simulate a lag and just used to latest value for a best case scenario.
+The symptom set used was s01, s03, and s04 from [here](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/google-symptoms.html).
+- NWSS - the originating dataset has minimal revisions, but as this is a dataset with quite a lot of processing from the underlying that involves some amount of time travel, so it is unclear how much revision behavior is present.
+- NWSS_regional - same as NWSS, just aggregated to the HHS region level.
 
-### AR with population scaling
+#### Covid exogenous features
 
-Internal name: `scaled_pop`.
+- NSSP - same as flu.
+- Google-Symptoms - same as flu, though we used a slightly different symtom set (just s04 and s05 from [here](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/google-symptoms.html)).
 
-A simple model, which predicts using
+### Augmented Data Forecaster
 
-$$x_{t+k} = ar(x)$$
+Internal name: `scaled_pop` (with `filter_source = ""`).
 
-where $x$ is scaled according to each state’s population.
+This forecaster is still the standard autoregressive model, but with additional training data.
+Inspired by UMass-flusion, the additional training data consisted of historical data from ILI+ and Flusurv+, which was brought to a comprable level with NHSN and treated as additional observations of the target variable (hence the name "augmented data").
+Flusurv was taken from epidata, but ILI+ was constructed by Evan Ray and given to Richard (Berkeley Summer 2024 intern).
+Naturally, this forecaster was only used for flu, as the same data was not available for covid.
 
-Three versions, two with different engines `quantreg` and `grf`, and the final one with augmented data.
+#### Scaling Parameters (Data Whitening)
 
-### AR with population scaling and seasonal features
+The augmented data forecasters took a few different approaches to data whitening (akin to [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler) from scikit-learn).
 
-Internal name: `scaled_pop_seasonal`.
-
-There are 2 seasonal features that we're trying here:
+- `scale_method`
+  - `quantile` - scales the data so that the difference between the 5th and 95th quantiles is 1
+  - `quantile_upper` - scales the data so that the 95th quantile is 1 (this was used by UMass-flusion)
+  - `std` - scales the data so that one standard deviation is 1
+  - `none` - no scaling
+  - We did not see a significant difference in changing the above parameter, so we used the default `quantile` the rest of the time.
+- `center_method`
+  - `median` - centers the data so that the median is 0
+  - `mean` - centers the data so that the mean is 0
+  - `none` - no centering
+  - We did not see a significant difference in changing the above parameter, so we used the default `median` the rest of the time.
+- `nonlin_method`
+  - `quart_root` - takes the 4th root of the data (and adds 0.01 to avoid negative values)
+  - `none` - no non-linear transformation
+  - Of these, `quart_root` gave us the best results, so we used that the rest of the time. There were occasional issues with the epsilon offset causing a positive value to become the floor as the inversion was taken.
 
-1. taking the first 3 PC components from the whitened fused data (so nhsn, ILI+, and Flusurv). (Note that it's 2 for covid).
-2. 2 indicators that roughly correspond to before, during and after the typical peak (first is true when `season_week < 16`, the second is true when `season_week > 20`, and the peak is captured by the overall constant).
-   Note that unusually, these features are actually led rather than lagged, since we should be predicting using the target's coefficient, rather than the present one.
+### Climatological
 
-### Flusion-like
-
-Roughly designed in line with the flusion model.
+This was our term for a forecaster that directly forecast a distribution built from similar weeks from previous seasons (in analogy with baseline weather forecasting).
+We found that in some cases it made a reasonable baseline, though when the current season's peak time was significatly different from the seasons in the training data, it was not particularly effective.
 
 ### No Recent Outcome
 
-This is the fall-back forecaster, in case we have no data, but are forced to make a prediction.
+This was a fall-back forecaster built for the scenario where NHSN data was not going to reported in time for the start of the forecasting challenge.
 
 A flusion-adjacent model pared down to handle the case of not having the target as a predictor.
 
-$$\bar{x}_{t+k} = f(t_{season}) + p + d + \big\langle y_{t-k}\big\rangle_{k=0:1} + \big\langle y_{t-k}\big\rangle_{t=0:3}$$
-
-where $y$ here is any exogenous variables; initially this will be empty, as nssp is missing some states, so we will have to rewrite these models to handle missing geos (most likely by having a separate model for the case when an exogenous variable is missing).
+$$\bar{x}_{t+k} = \big\langle y_{t-k}\big\rangle_{k=0:1} + \big\langle y_{t-k}\big\rangle_{t=0:3}$$
 
-$f$ is either the identity or 2 sine terms, defined so that the first has half a period during the season, and is zero after it, while the second is one period over the season, with zero after
+where $y$ here is any set of exogenous variables.
 
 ### Flatline
 
-This is what the FluSight-baseline is based on, so they should be identical. However, at the moment, this has scaling issues.
+A simple "LOCF" forecaster that simply forecasts the last observed value and uses residuals to create a distributional forecast. This is what the FluSight-baseline is based on, so they should be identical.

From 5b8da7c5235a4db7d259ddda3add8156a5ca1df7 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 29 Apr 2025 15:32:10 -0700
Subject: [PATCH 40/62] doc: add some styling to template.md

---
 reports/template.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/reports/template.md b/reports/template.md
index 967eb55e..e8a711db 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -1,3 +1,12 @@
+<style>
+body {
+  max-width: 800px;
+  margin: 2rem auto;
+  padding: 0 1rem;
+  font-family: sans-serif;
+}
+</style>
+
 # Forecast Reports
 
 [GitHub Repo](https://github.com/cmu-delphi/explorationt-tooling/)

From 245544967d12cf71c58ad65a6ce26e3b0e8fe50a Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 29 Apr 2025 15:34:08 -0700
Subject: [PATCH 41/62] doc: minor template lint

---
 reports/template.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/reports/template.md b/reports/template.md
index e8a711db..5abbc726 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -7,7 +7,7 @@ body {
 }
 </style>
 
-# Forecast Reports
+# Delphi Forecast Reports
 
 [GitHub Repo](https://github.com/cmu-delphi/explorationt-tooling/)
 
@@ -62,6 +62,7 @@ body {
 ## Description of Forecaster Families
 
 The main forecaster families were:
+
 - Autoregressive models (AR)
   - with seasonal features
   - with exogenous features

From bbd73dd3630677057d817cf4467083ed0b3b7519 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 29 Apr 2025 15:49:22 -0700
Subject: [PATCH 42/62] doc: more styling

---
 reports/template.md | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/reports/template.md b/reports/template.md
index 5abbc726..a962453f 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -1,9 +1,36 @@
 <style>
+/* Some basic styling (a reasonable reading width and dark mode support) */
 body {
   max-width: 800px;
   margin: 2rem auto;
   padding: 0 1rem;
   font-family: sans-serif;
+  background: white;
+  color: black;
+}
+
+a:link {
+  color: blue;
+}
+
+a:visited {
+  color: purple;
+}
+
+/* Dark mode support */
+@media (prefers-color-scheme: dark) {
+  body {
+    background: #121212;
+    color: #e0e0e0;
+  }
+
+  a:link {
+    color: #80cbc4;
+  }
+
+  a:visited {
+    color: #b39ddb; /* light purple for contrast on dark bg */
+  }
 }
 </style>
 
@@ -106,9 +133,9 @@ However, we also had mixed results from tests of this feature in very simple syn
 - We also tried using the climatological median of the target variable as a feature (see below for definition of "climatological").
 - Note that unusually, the last two features are actually led rather than lagged, since we should be predicting using the target's coefficient, rather than the present one.
 
-### Autoregressive models with seasonal and exogenous features
+### Autoregressive models with exogenous features
 
-Internal name: `scaled_pop_seasonal` (with `filter_source = "nhsn"`).
+Internal name: `scaled_pop_seasonal`.
 
 These models could opt into the same seasonal features as the `scaled_pop_seasonal` forecaster, but also included exogenous features.
 
@@ -126,7 +153,7 @@ The symptom set used was s01, s03, and s04 from [here](https://cmu-delphi.github
 - NSSP - same as flu.
 - Google-Symptoms - same as flu, though we used a slightly different symtom set (just s04 and s05 from [here](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/google-symptoms.html)).
 
-### Augmented Data Forecaster
+### Autoregressive models with augmented data
 
 Internal name: `scaled_pop` (with `filter_source = ""`).
 

From 6eef6a1cd855682a7b66d6ce03ee5bfbcac4011a Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 29 Apr 2025 15:51:59 -0700
Subject: [PATCH 43/62] doc: even more

---
 reports/template.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/reports/template.md b/reports/template.md
index a962453f..9034a60f 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -124,13 +124,13 @@ Internal name: `scaled_pop_seasonal`.
 
 We tried a few different attempts at incorporating seasonal features:
 
-- The approach that performed the best was using a training window that grabbed a window of data (about 4 weeks before and ahead) around the forecast epiweek from the current and previous seasons.
-- Two indicator variables that roughly correspond to before, during, and after the typical peak (roughly, `before = season_week < 16`, `during = 16 <= season_week <= 20`, and `after = season_week > 20`).
-- Taking the first two principal components of the full whitened augmented data reshaped as `(epiweek, state_source_season_value)`.
+- The approach that performed the best was using a *seasonal training window* that grabbed a window of data (about 4 weeks before and ahead) around the forecast epiweek from the current and previous seasons.
+- Two *indicator variables* that roughly correspond to before, during, and after the typical peak (roughly, `before = season_week < 16`, `during = 16 <= season_week <= 20`, and `after = season_week > 20`).
+- Taking the first two *principal components* of the full whitened augmented data reshaped as `(epiweek, state_source_season_value)`.
 (We found that this was not particularly effective, so we did not use it.
 Despite spending a week debugging this, we could not rule out the possibility that it was a bug.
 However, we also had mixed results from tests of this feature in very simple synthetic data cases.)
-- We also tried using the climatological median of the target variable as a feature (see below for definition of "climatological").
+- We also tried using the *climatological median* of the target variable as a feature (see below for definition of "climatological").
 - Note that unusually, the last two features are actually led rather than lagged, since we should be predicting using the target's coefficient, rather than the present one.
 
 ### Autoregressive models with exogenous features

From 71beaf00521c46832057a6ab17cab76576ce684f Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 29 Apr 2025 15:59:51 -0700
Subject: [PATCH 44/62] doc: lint revision summary

---
 .../reports/revision_summary_report_2025.Rmd  | 28 ++++++++-----------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/scripts/reports/revision_summary_report_2025.Rmd b/scripts/reports/revision_summary_report_2025.Rmd
index a161684b..6b3d0db2 100644
--- a/scripts/reports/revision_summary_report_2025.Rmd
+++ b/scripts/reports/revision_summary_report_2025.Rmd
@@ -1,6 +1,5 @@
 ---
-title: "Revision summary 2025"
-author: Delphi Forecast Team
+title: "Revision Summary 2025"
 date: "Rendered: `r format(Sys.time(), '%Y-%m-%d %H:%M:%S')`"
 output:
   html_document:
@@ -8,10 +7,8 @@ output:
     toc: True
     # self_contained: False
     # lib_dir: libs
-params:
-  disease: "covid"
-  scores: ""
-  forecast_dates: ""
+editor_options:
+  chunk_output_type: console
 ---
 
 ```{css, echo=FALSE}
@@ -28,18 +25,17 @@ body .main-container {
 }
 ```
 
-```{r echo=FALSE}
+$$\\[.4in]$$
+
+```{r echo=FALSE, warning=FALSE,message=FALSE}
 knitr::opts_chunk$set(
   fig.align = "center",
-  message = TRUE,
+  message = FALSE,
   warning = FALSE,
   cache = FALSE
 )
 ggplot2::theme_set(ggplot2::theme_bw())
-```
-
-```{r setup, include=FALSE}
-suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
+source(here::here("R/load_all.R"))
 ```
 
 # Overall takeaways
@@ -127,7 +123,7 @@ nhsn_archive_flu %>%
   autoplot(value, .facet_filter = geo_value %in% (data_substitutions$geo_value %>% unique())) + geom_point(data = data_substitutions, aes(x = time_value, y = value)) + facet_wrap(~geo_value, scale = "free")
 ```
 
-which doesn't look all that great. 
+which doesn't look all that great.
 To calculate how much closer (or further) we were from the final value, first we construct the relevant snapshots:
 ```{r}
 final_values <- nhsn_archive_flu %>% epix_as_of_current() %>% mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
@@ -152,7 +148,7 @@ full_table <- data_substitutions %>%
   ) %>%
   rename(as_of_value = value) %>%
   mutate()
-  
+
 diffs <- full_table %>%
   mutate(
     abs_diff = value_substituted - value_final,
@@ -248,7 +244,7 @@ nhsn_archive_covid <- nhsn_archive_covid$DT %>% filter(time_value >= "2024-11-19
   filter(geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
 nhsn_archive_covid$time_type <- "day"
 revision_summary <- nhsn_archive_covid %>%
-  epiprocess::revision_analysis(value, min_waiting_period = NULL) 
+  epiprocess::revision_analysis(value, min_waiting_period = NULL)
 revision_summary %>% print(quick_revision = 7)
 ```
 
@@ -340,7 +336,7 @@ full_table <- data_substitutions %>%
   ) %>%
   rename(as_of_value = value) %>%
   mutate()
-  
+
 diffs <- full_table %>%
   mutate(
     abs_diff = value_substituted - value_final,

From 02668e141f6ea60ab2df985727278b471e88bae6 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Tue, 29 Apr 2025 17:52:37 -0500
Subject: [PATCH 45/62] latest forecast

---
 scripts/covid_hosp_prod.R | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/scripts/covid_hosp_prod.R b/scripts/covid_hosp_prod.R
index c6d4b8c2..52e2bf5c 100644
--- a/scripts/covid_hosp_prod.R
+++ b/scripts/covid_hosp_prod.R
@@ -91,13 +91,14 @@ g_windowed_seasonal_extra_sources <- function(epi_data, ahead, extra_data, ...)
   fcst
 }
 g_forecaster_params_grid <- tibble(
-  id = c("linear", "windowed_seasonal", "windowed_seasonal_extra_sources", "climate_base", "climate_geo_agged"),
-  forecaster = rlang::syms(c("g_linear", "g_windowed_seasonal", "g_windowed_seasonal_extra_sources", "g_climate_base", "g_climate_geo_agged")),
+  id = c("linear", "windowed_seasonal", "windowed_seasonal_extra_sources", "climate_base", "climate_geo_agged", "seasonal_nssp_latest"),
+  forecaster = rlang::syms(c("g_linear", "g_windowed_seasonal", "g_windowed_seasonal_extra_sources", "g_climate_base", "g_climate_geo_agged", "g_windowed_seasonal_extra_sources")),
   params = list(
     list(),
     list(),
     list(),
     list(),
+    list(),
     list()
   ),
   param_names = list(
@@ -105,6 +106,7 @@ g_forecaster_params_grid <- tibble(
     list(),
     list(),
     list(),
+    list(),
     list()
   )
 )
@@ -175,20 +177,32 @@ forecast_targets <- tar_map(
   tar_target(
     name = forecast_res,
     command = {
-      nhsn_data <- nhsn_archive_data %>%
-        epix_as_of(min(as.Date(forecast_date_int), nhsn_archive_data$versions_end)) %>%
+      # if the forecaster is named latest, it should use the most up to date
+      # version of the data
+      if (grepl("latest", id)) {
+        nhsn_data <- nhsn_archive_data %>%
+          epix_as_of(nhsn_archive_data$versions_end) %>% filter(time_value < as.Date(forecast_date_int))
+        nssp_data <- nssp_archive_data %>%
+          epix_as_of(nssp_archive_data$versions_end) %>% filter(time_value < as.Date(forecast_date_int))
+      } else {
+        nhsn_data <- nhsn_archive_data %>%
+        epix_as_of(min(as.Date(forecast_date_int), nhsn_archive_data$versions_end))
+        nssp_data <- nssp_archive_data %>%
+          epix_as_of(min(as.Date(forecast_date_int), nssp_archive_data$versions_end))
+      }
+      nhsn_data <- nhsn_data %>%
         add_season_info() %>%
         mutate(
           geo_value = ifelse(geo_value == "usa", "us", geo_value),
           time_value = time_value - 3
         ) %>%
-        data_substitutions(covid_data_substitutions, as.Date(forecast_generation_date_int)) %>%
         filter(geo_value %nin% g_insufficient_data_geos)
+      if (!grepl("latest", id)) {
+        nhsn_data %<>%
+          data_substitutions(covid_data_substitutions, as.Date(forecast_generation_date_int))
+      }
       attributes(nhsn_data)$metadata$as_of <- as.Date(forecast_date_int)
 
-      nssp_data <- nssp_archive_data %>%
-        epix_as_of(min(as.Date(forecast_date_int), nssp_archive_data$versions_end))
-
       forecaster_fn <- get_partially_applied_forecaster(forecaster, aheads, params, param_names)
 
       forecaster_fn(nhsn_data, extra_data = nssp_data) %>%

From f6822d0c241479cbea64dd29709af2be3d52bb9f Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Wed, 30 Apr 2025 12:38:14 -0500
Subject: [PATCH 46/62] latest fcst needs -1 ahead not present

---
 R/forecasters/data_validation.R                | 10 ++++++++++
 R/forecasters/forecaster_scaled_pop_seasonal.R |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/R/forecasters/data_validation.R b/R/forecasters/data_validation.R
index 9cbd7bf1..40fce850 100644
--- a/R/forecasters/data_validation.R
+++ b/R/forecasters/data_validation.R
@@ -97,6 +97,16 @@ filter_extraneous <- function(epi_data, filter_source, filter_agg_level) {
   return(epi_data)
 }
 
+#' the minus one ahead causes problems for `quantile_regression` if that data is
+#' actually present, so we should filter it out
+filter_minus_one_ahead <- function(epi_data, ahead) {
+  if (ahead < 0) {
+    dont_include <- attr(epi_data, "metadata")$as_of + ahead
+    epi_data %<>% filter(time_value < dont_include)
+  }
+  epi_data
+}
+
 #' Unwrap an argument if it's a list of length 1
 #'
 #' Many of our arguments to the forecasters come as lists not because we expect
diff --git a/R/forecasters/forecaster_scaled_pop_seasonal.R b/R/forecasters/forecaster_scaled_pop_seasonal.R
index 7d96bd76..5091de92 100644
--- a/R/forecasters/forecaster_scaled_pop_seasonal.R
+++ b/R/forecasters/forecaster_scaled_pop_seasonal.R
@@ -76,6 +76,8 @@ scaled_pop_seasonal <- function(
   epi_data %<>% filter_extraneous(filter_source, filter_agg_level)
   # this is a temp fix until a real fix gets put into epipredict
   epi_data <- clear_lastminute_nas(epi_data, cols = c(outcome, extra_sources))
+  # predicting the -1 ahead when it is present sometimes lead to freezeing
+  epi_data %<>% filter_minus_one_ahead(ahead)
   # this next part is basically unavoidable boilerplate you'll want to copy
   args_input <- list(...)
   # edge case where there is no data or less data than the lags; eventually epipredict will handle this

From ffd1c9ac7cfe5e9704b643b2f02a4dd258c97538 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Wed, 30 Apr 2025 17:30:39 -0500
Subject: [PATCH 47/62] adding latest to flu

---
 scripts/flu_hosp_prod.R | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/scripts/flu_hosp_prod.R b/scripts/flu_hosp_prod.R
index 1406357c..7149a29d 100644
--- a/scripts/flu_hosp_prod.R
+++ b/scripts/flu_hosp_prod.R
@@ -103,13 +103,14 @@ g_windowed_seasonal_extra_sources <- function(epi_data, ahead, extra_data, ...)
   fcst
 }
 g_forecaster_params_grid <- tibble(
-  id = c("linear", "windowed_seasonal", "windowed_seasonal_extra_sources", "climate_base", "climate_geo_agged"),
-  forecaster = rlang::syms(c("g_linear", "g_windowed_seasonal", "g_windowed_seasonal_extra_sources", "g_climate_base", "g_climate_geo_agged")),
+  id = c("linear", "windowed_seasonal", "windowed_seasonal_extra_sources", "climate_base", "climate_geo_agged", "seasonal_nssp_latest"),
+  forecaster = rlang::syms(c("g_linear", "g_windowed_seasonal", "g_windowed_seasonal_extra_sources", "g_climate_base", "g_climate_geo_agged", "g_windowed_seasonal_extra_sources")),
   params = list(
     list(),
     list(),
     list(),
     list(),
+    list(),
     list()
   ),
   param_names = list(
@@ -117,6 +118,7 @@ g_forecaster_params_grid <- tibble(
     list(),
     list(),
     list(),
+    list(),
     list()
   )
 )
@@ -203,18 +205,27 @@ forecast_targets <- tar_map(
     full_data,
     command = {
       # Train data
+      if (grepl("latest", id)) {
+        train_data <- nhsn_archive_data %>%
+          epix_as_of(nhsn_archive_data$versions_end) %>% filter(time_value < as.Date(forecast_date_int))
+      } else {
       train_data <- nhsn_archive_data %>%
-        epix_as_of(min(as.Date(forecast_date_int), nhsn_archive_data$versions_end)) %>%
+        epix_as_of(min(as.Date(forecast_date_int), nhsn_archive_data$versions_end))
+      }
+    train_data %<>%
         add_season_info() %>%
         mutate(
           geo_value = ifelse(geo_value == "usa", "us", geo_value),
           time_value = time_value - 3,
           source = "nhsn"
-        ) %>%
-        data_substitutions(
-          flu_data_substitutions,
-          as.Date(forecast_generation_date_int)
-        ) %>%
+        )
+    if (!grepl("latest", id)) {
+    train_data %<>% data_substitutions(
+                      flu_data_substitutions,
+                      as.Date(forecast_generation_date_int)
+                    )
+    }
+     train_data %<>%
         filter(geo_value %nin% g_insufficient_data_geos)
       attributes(train_data)$metadata$as_of <- as.Date(forecast_date_int)
 
@@ -228,8 +239,14 @@ forecast_targets <- tar_map(
   tar_target(
     name = forecast_res,
     command = {
-      nssp_data <- nssp_archive_data %>%
-        epix_as_of(min(as.Date(forecast_date_int), nssp_archive_data$versions_end))
+      if (grepl("latest", id)) {
+        nssp_data <- nssp_archive_data %>%
+          epix_as_of(nssp_archive_data$versions_end) %>%
+          filter(time_value < as.Date(forecast_date_int))
+      } else {
+        nssp_data <- nssp_archive_data %>%
+          epix_as_of(min(as.Date(forecast_date_int), nssp_archive_data$versions_end))
+      }
 
       forecaster_fn <- get_partially_applied_forecaster(forecaster, aheads, params, param_names)
 

From 93d9188001b4875e1bc56b53864ef7e431b9bbfa Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Wed, 30 Apr 2025 17:33:52 -0500
Subject: [PATCH 48/62] `latest` results, takeaways

---
 scripts/reports/season_summary_2025.Rmd | 133 ++++++++----------------
 1 file changed, 43 insertions(+), 90 deletions(-)

diff --git a/scripts/reports/season_summary_2025.Rmd b/scripts/reports/season_summary_2025.Rmd
index 11e15d27..968cf340 100644
--- a/scripts/reports/season_summary_2025.Rmd
+++ b/scripts/reports/season_summary_2025.Rmd
@@ -43,7 +43,7 @@ library(DT)
 # Define aggregation functions
 Mean <- function(x) mean(x, na.rm = TRUE)
 GeoMean <- function(x, offset = 0) exp(Mean(log(x + offset)))
-our_forecasters <- c("linear", "windowed_seasonal", "windowed_seasonal_nssp", "climate_base", "climate_geo_agged", "climate_linear", "ensemble_windowed", "retro_submission", "CMU-TimeSeries")
+our_forecasters <- c("linear", "windowed_seasonal", "windowed_seasonal_nssp", "climate_base", "climate_geo_agged", "climate_linear", "ensemble_windowed", "retro_submission", "CMU-TimeSeries", "seasonal_nssp_latest")
 flu_scores <-
   qs2::qs_read(here::here("flu_hosp_prod", "objects", "scores")) %>%
   mutate(forecaster = case_match(
@@ -197,6 +197,8 @@ Generally, more teams were under 50% coverage than over, so there is also a syst
 
 ## Flu Scores
 
+Note that `seasonal_nssp_latest` uses the latest version of the data, to see how much better or worse our forecasts might be if we could get a correct estimate of the revisions.
+
 Before we get into the actual scores, we need to define how we go about creating 3 different phases.
 They are `increasing`, `peak`, and `decreasing`.
 Roughly, `peak` is the interval where the value is within 50% of the max and the other two are before and after.
@@ -544,6 +546,7 @@ Our best forecasters cluster around a mean wis between 124 and 132, with our act
 The absolute best forecaster this season was `Psi-PROF`, with a `mean_wis` of 113, which is within 10% of `CMU-TimeSeries`.
 While we couldn't have improved our `mean_wis`, we could have improved our population normalized mean wis `pop_norm_wis` by using `windowed_seasonal_nssp` by ~10% as well.
 
+
 Absolute error before and after population normalizing has a similar story, though the order for models from other labs changes, and our relative score improves.
 
 Using `mean_cov_50`, our 50% coverage is generally good, with most models within 10 percentage points of 50%.
@@ -552,6 +555,11 @@ There's quite a bit of variability in other's models, with most models having to
 Using `mean_cov_90`, most of these models have quantiles that are too narrow, though `CMU-TimeSeries` is within 5 percentage points; `climate_geo_agged` has a 90% coverage, but it is otherwise quite inaccurate.
 `retro_submission` does surprisingly well on this metric by hitting 90% exactly.
 
+
+Note that `seasonal_nssp_latest` is better than the other models by any metric, suggesting that revision information would be quite useful for forecasting, if we could get a hold of it. 
+It is otherwise identical to `windowed_seasonal_nssp`, so it improves by ~20% and has better coverage.
+Looking at the Phase tab, it does especially well during the `increasing` and `peak` phases, with about the same performance as `windowed_seasonal_nssp` in the `decreasing` phase.
+
 #### Phase
 
 Breaking up the scoring by phase, most forecasters cluster together pretty tightly, with only `FluSight-baseline`, `linear`, and both `climate` only models having appreciably worse scores.
@@ -695,7 +703,8 @@ covid_score_summary <- covid_scores %>%
     n = n()
   ) %>%
   arrange(mean_wis) %>%
-  rename(id = forecaster) %>%
+  rename(id = forecaster)
+covid_score_summary %>%
   datatable()
 ```
 
@@ -810,7 +819,7 @@ We have a separate plot for `climate_geo_agged` because it does so poorly on the
 If you want to see a non-population scaled version of this, switch `y = pop_norm_wis` to `y = mean_wis` below, and comment out the `climate_geo_agged` filter.
 
 ```{r covid_plot_geo_agged, fig.height = 8, fig.width = 12}
-pop_wis_order <- covid_score_summary %>% arrange(pop_norm_wis) %>% pull(forecaster)
+pop_wis_order <- covid_score_summary %>% arrange(pop_norm_wis) %>% pull(id)
 score_geo <- covid_scores %>%
   group_by(forecaster, geo_value) %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
@@ -868,13 +877,15 @@ Unlike in Flu, the `flat` phase exists, and roughly resembles `decreasing` in di
 `climate_base` is the closest any of these scores have come to normally distributed.
 `climate_geo_agged` is particularly bad for Covid.
 
-```{r, fig.height = 23, fig.width = 13}
+```{r covid_plot_score_histogram, fig.height = 23, fig.width = 13}
+pop_score_order <- covid_score_summary %>% arrange(pop_norm_wis) %>% pull(id)
+wis_score_order <- covid_score_summary %>% arrange(mean_wis) %>% pull(id)
 covid_scores %>%
   left_join(covid_within_max, by = "geo_value") %>%
   left_join(state_census, by = join_by(geo_value == abbr)) %>%
   mutate(wis = wis * 1e5/pop) %>%
   mutate(pop = factor(pop)) %>%
-  mutate(forecaster = factor(forecaster, levels = wis_score_order)) %>%
+  mutate(forecaster = factor(forecaster, levels = pop_score_order)) %>%
   group_by(forecaster) %>%
   mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
   ggplot(aes(x = wis, color = pop,  y = ifelse(after_stat(count) > 0, after_stat(count), NA))) +
@@ -927,9 +938,10 @@ ggplotly(covid_forecast_plt)
 
 ### Results
 
-One peculiar thing about Covid scoring: on the first forecast date, CMU-TimeSeries has *much* worse scores than almost any of the subsequent days (you can see this in the Scores Aggregated By Forecast Date tab below).
+One peculiar thing about Covid scoring: on the first forecast date, CMU-TimeSeries has *much* worse scores than almost any of the subsequent days (you can see this in the Scores Aggregated By Forecast Date tab above).
 There are two related issues here:
-- first, our initial model combined climate_base and linear, and the climate_base component was unusually bad early in the season, because this season started later than previous seasons,
+
+- first, our initial model simply averaged `climate_base` and `linear`, and the `climate_base` component was unusually bad early in the season, because this season started later than previous seasons,
 - second, the data had substantial revisions (this is discussed in detail in [this notebook](first_day_wrong.html)), however this effect is much smaller, since other forecasters had access to the same data.
 
 This mishap dragged the CMU-TimeSeries score down overall by quite a lot and its better performance later in the season is not enough to make up for it.
@@ -945,6 +957,8 @@ Given the unusual nature of the season, this is somewhat unsurprising.
 To some degree this explains the poor performance of `CMU-TimeSeries`.
 You can see this by looking at the "Scores Aggregated By Forecast Date" tab, where the first 3 weeks of `CMU-TimeSeries` are significantly worse than `climate_linear`, let alone the ensemble or our best models.
 
+`seasonal_nssp_latest`, which has access to the latest data, doesn't have a significantly different score from `windowed_seasonal_nssp`, which it is otherwise identical to.
+
 #### Aggregated by phase
 
 There are two tabs dedicated to this, one with and one without a separate `flat` phase, which labels an entire state as `flat` if the duration of the `peak` is too long.
@@ -984,13 +998,7 @@ We can probably improve our forecasts by incorporating revision behavior for bot
 Further, flu and covid revision behavior is fairly strongly correlated; it is reported through the same channels by the same people, so this makes sense.
 We should look into the extra columns to see if it provides useful information for handling revision behavior.
 
-
-## Data substitution
-
-In short, this didn't go well.
-It was a coin toss for covid, and worse than not doing corrections for flu.
-
-## Revision examples
+### Revision examples
 
 ```{r}
 nhsn_archive_flu <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "nhsn_archive_data"))
@@ -1024,84 +1032,11 @@ nhsn_archive_covid %>% autoplot(value, .facet_filter = geo_value %in% av_re_spre
   labs(title = "Covid revisions for the highest mean relative spread")
 ```
 
+### Data substitution
 
-# Appendix
-## Methods of selecting season phase
-There's a lot of flexibility in this decision.
-Here's some alternatives that we looked at.
-
-```{r}
-flu_gr <- flu_current %>%
-  group_by(geo_value) %>%
-  mutate(gr = growth_rate(value, method = "linear_reg", h = 3)) %>%
-  filter(time_value > "2024-11-01")
-flu_gr %>% autoplot(gr, .facet_by = "geo_value")
-flu_max_dates <- flu_current %>%
-  group_by(geo_value) %>%
-  slice_max(value) %>%
-  select(geo_value, time_value_max = time_value)
-flu_peak_season <- flu_max_dates %>% ungroup() %>% summarize(peak_start = min(time_value_max), peak_end = max(time_value_max)) %>% pivot_longer(cols = c(peak_start, peak_end))
-flu_gr %>%
-  mutate(value = value / max(value)) %>%
-  mutate(neg = gr < 0) %>%
-  group_by(geo_value) %>%
-  arrange(desc(time_value)) %>%
-  mutate(count_so_far = TRUE, count_so_far = cumsum(count_so_far), frac_neg = (cumsum(neg)) / sum(neg)) %>%
-  arrange(geo_value) %>%
-  left_join(flu_max_dates, by = "geo_value") %>%
-  ggplot(aes(x = time_value, y = frac_neg)) +
-  geom_point() +
-  geom_line(aes(y = gr)) +
-  geom_line(aes(y = value)) +
-  geom_vline(data = flu_peak_season, aes(xintercept = value)) +
-  facet_wrap(~geo_value, scale = "free")
-flu_gr %>%
-  left_join(flu_max_dates, by = "geo_value") %>%
-  filter(time_value > time_value_max) %>%
-  arrange(time_value)
-```
-
-```{r}
-flu_current %>%
-  filter(time_value > "2024-11-01") %>%
-  autoplot(value, .facet_by = "geo_value") +
-  geom_vline(data = flu_within_max, aes(xintercept = first_above)) +
-  geom_vline(data = flu_within_max, aes(xintercept = last_above)) +
-  geom_vline(data = flu_peak_season, aes(xintercept = value, color = "green")) +
-  facet_wrap(~geo_value, scale = "free")
-```
-
-```{r}
-covid_gr <- covid_archive %>%
-  epix_as_of_current() %>%
-  filter(geo_value %nin% c("as", "gu", "mp")) %>%
-  group_by(geo_value) %>%
-  mutate(gr = growth_rate(value, method = "linear_reg", h = 3)) %>%
-  filter(time_value > "2024-09-01")
-covid_gr %>% autoplot(gr, .facet_by = "geo_value")
-```
+In short, this didn't go well.
+It was a coin toss for covid, and worse than not doing corrections for flu.
 
-```{r}
-covid_gr %>%
-  mutate(value = value / max(value)) %>%
-  mutate(neg = gr < 0) %>%
-  group_by(geo_value) %>%
-  arrange(desc(time_value)) %>%
-  mutate(count_so_far = TRUE, count_so_far = cumsum(count_so_far), frac_neg = (cumsum(neg)) / sum(neg)) %>%
-  arrange(geo_value) %>%
-  ggplot(aes(x = time_value, y = frac_neg)) +
-  geom_point() +
-  geom_line(aes(y = gr)) +
-  geom_line(aes(y = value)) +
-  facet_wrap(~geo_value, scale = "free")
-covid_max_dates <- covid_gr %>%
-  slice_max(gr) %>%
-  select(geo_value, time_value_max = time_value)
-covid_gr %>%
-  left_join(covid_max_dates, by = "geo_value") %>%
-  filter(time_value > time_value_max) %>%
-  arrange(time_value)
-```
 
 [^1]: this is off from our local version of the scoring by .6, which is nonzero but not appreciably different.
       It's scored on N=4160 vs the local 3692, which probably comes down to negative aheads.
@@ -1109,3 +1044,21 @@ covid_gr %>%
 
 [^2]: this is further off both in absolute and further yet in relative terms from our local scoring, which has `CMU-TimeSeries` at 46.32 rather than 44.8.
       It's unclear why; there are 3952 samples scored on the remote vs 3692 locally, so ~300 scored there that we don't score where we apparently did better.
+
+# Further methods to explore
+
+The [decreasing forecasters notebook](decreasing_forecasters.html) has a number of suggestions, though that is a problem that occurs most frequently with Flu data rather than Covid.
+The broad categories there are 
+
+1. Filtering to the relevant phase; `windowed_seasonal_*` is roughly an example of this, which is likely why it outperformed simple AR by enough to be better.
+2. better use of non-linear models. This would allow us to capture increasing, decreasing, and flat trends in the same model.
+3. Explicitly using the growth rate as a co-variate, or fitting on differences rather than the raw value.
+
+A perhaps unexpected direction that came out of that notebook is some sort of non-linear per-state scaling; we found that scaling the counts by population^2 (or scaling rates by population) eliminated the decreasing forecasts problem.
+Hopefully the Yeo-Johnson step we have been working on will be able to choose an appropriate scaling factor to more systematically recreate this.
+
+Given the high utility of NSSP this season, the most important thing we can do is look for useful leading exogenous variables.
+In addition to signals in epidata and processed versions of those, NHSN releases a large number of accompanying variables which we should consider more explicitly.
+
+In addition to that, incorporating revision behavior would likely yield some results.
+The performance of `seasonal_nssp_latest` on Flu supports this, though it's performance on Covid was surprisingly lackluster.

From 89ffe08fc0cee9015aa07d22378d32eeb5c60b35 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Thu, 8 May 2025 14:31:41 -0700
Subject: [PATCH 49/62] doc+lint: add future work section, caveat diffed tests,
 improve forecaster descriptions

---
 R/imports.R                                   |  3 ++
 reports/template.md                           | 39 ++++++++++-------
 scripts/reports/climatological_model.Rmd      | 18 +-------
 .../reports/climatological_model_covid.Rmd    | 18 +-------
 scripts/reports/comparison-notebook.Rmd       | 15 +------
 scripts/reports/decreasing_forecasters.Rmd    |  4 +-
 .../reports/overall-comparison-notebook.Rmd   | 36 +++++-----------
 .../reports/revision_summary_report_2025.Rmd  |  2 +-
 scripts/reports/score_report.Rmd              |  4 +-
 scripts/reports/season_summary_2025.Rmd       | 43 ++++++++++++-------
 10 files changed, 74 insertions(+), 108 deletions(-)

diff --git a/R/imports.R b/R/imports.R
index 94809ecd..d72fcb02 100644
--- a/R/imports.R
+++ b/R/imports.R
@@ -9,6 +9,7 @@ library(crew)
 library(data.table)
 library(dplyr)
 library(DT)
+options(DT.options = list(scrollX = TRUE))
 library(epidatr)
 library(epipredict)
 library(epiprocess)
@@ -16,6 +17,7 @@ library(ggplot2)
 library(glue)
 library(grf)
 library(here)
+library(httpgd)
 if (Sys.getenv("COVID_SUBMISSION_DIRECTORY", "cache") != "cache") {
   library(hubValidations)
 }
@@ -36,6 +38,7 @@ library(recipes)
 library(renv)
 library(rlang)
 library(rspm)
+library(scales)
 library(scoringutils)
 library(slider)
 library(stringr)
diff --git a/reports/template.md b/reports/template.md
index 9034a60f..8d54f95b 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -40,9 +40,9 @@ a:visited {
 
 ## Overview
 
-- The weekly fanplots were used by the team for visual inspections of the forecasts.
-- The season reports provide a general analysis of the season's data and forecasts performance.
-- The backtesting reports provide a detailed analysis of a wide variety of forecasters' performance on the previous season's data.
+- The weekly fanplots were used by the team to visually inspect the forecasts.
+- The season reports provide a general analysis of the 2024-2025 season's data and forecaster performance.
+- The backtesting reports were pre-season tests of a variety of forecasters on the 2023-2024 season's data.
 - A description of the forecaster families explored is provided at the bottom of the page.
 
 ## Weekly Fanplots 2024-2025 Season
@@ -56,9 +56,9 @@ a:visited {
 - [An Analysis of Decreasing Behavior in Forecasters](decreasing_forecasters.html)
 - [NHSN 2024-2025 Data Analysis](new_data.html)
 
-## Backtesting on 2023-2024 Season
+## 2023-2024 Season Backtesting
 
-- [Exploration Summary](exploration_summary_2024.html)
+- [Forecaster Exploration Summary](exploration_summary_2024.html)
 - Flu
   - All forecasters population scale their data, use geo pooling, and train using quantreg.
   - These definitions are in the `flu_forecaster_config.R` file.
@@ -77,6 +77,7 @@ a:visited {
 - Covid
   - All forecasters population scale their data, use geo pooling, and train using quantreg.
   - These definitions are in the `covid_forecaster_config.R` file.
+  - [Covid Overall](covid-overall-notebook.html)
   - [Covid AR](covid-notebook-scaled_pop_main.html)
   - [Covid AR with seasonal features](covid-notebook-scaled_pop_season.html)
   - [Covid AR with exogenous features](covid-notebook-scaled_pop_exogenous.html)
@@ -94,10 +95,12 @@ The main forecaster families were:
   - with seasonal features
   - with exogenous features
   - with augmented data
-- Climatological
-- Linear trend
-- No recent outcome
-- Flatline
+- "Ad-hoc" models
+  - Climatological
+  - Linear trend
+  - No recent outcome
+- Baseline models
+  - Flatline
 
 All the AR models had the option of population scaling, seasonal features, exogenous features, and augmented data.
 We tried all possible combinations of these features.
@@ -124,13 +127,13 @@ Internal name: `scaled_pop_seasonal`.
 
 We tried a few different attempts at incorporating seasonal features:
 
-- The approach that performed the best was using a *seasonal training window* that grabbed a window of data (about 4 weeks before and ahead) around the forecast epiweek from the current and previous seasons.
-- Two *indicator variables* that roughly correspond to before, during, and after the typical peak (roughly, `before = season_week < 16`, `during = 16 <= season_week <= 20`, and `after = season_week > 20`).
-- Taking the first two *principal components* of the full whitened augmented data reshaped as `(epiweek, state_source_season_value)`.
+- The approach that performed the best was using a **seasonal training window** that grabbed a window of data (about 4 weeks before and ahead) around the forecast epiweek from the current and previous seasons.
+- Two **indicator variables** that roughly correspond to before, during, and after the typical peak (roughly, `before = season_week < 16`, `during = 16 <= season_week <= 20`, and `after = season_week > 20`).
+- Taking the first two **principal components** of the full whitened augmented data reshaped as `(epiweek, state_source_season_value)`.
 (We found that this was not particularly effective, so we did not use it.
 Despite spending a week debugging this, we could not rule out the possibility that it was a bug.
 However, we also had mixed results from tests of this feature in very simple synthetic data cases.)
-- We also tried using the *climatological median* of the target variable as a feature (see below for definition of "climatological").
+- We also tried using the **climatological median** of the target variable as a feature (see below for definition of "climatological").
 - Note that unusually, the last two features are actually led rather than lagged, since we should be predicting using the target's coefficient, rather than the present one.
 
 ### Autoregressive models with exogenous features
@@ -164,10 +167,10 @@ Naturally, this forecaster was only used for flu, as the same data was not avail
 
 #### Scaling Parameters (Data Whitening)
 
-The augmented data forecasters took a few different approaches to data whitening (akin to [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler) from scikit-learn).
+We tried a few different approaches to data whitening.
 
 - `scale_method`
-  - `quantile` - scales the data so that the difference between the 5th and 95th quantiles is 1
+  - `quantile` - scales the data so that the difference between the 5th and 95th quantiles is 1 (akin to [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler) from scikit-learn)
   - `quantile_upper` - scales the data so that the 95th quantile is 1 (this was used by UMass-flusion)
   - `std` - scales the data so that one standard deviation is 1
   - `none` - no scaling
@@ -180,13 +183,17 @@ The augmented data forecasters took a few different approaches to data whitening
 - `nonlin_method`
   - `quart_root` - takes the 4th root of the data (and adds 0.01 to avoid negative values)
   - `none` - no non-linear transformation
-  - Of these, `quart_root` gave us the best results, so we used that the rest of the time. There were occasional issues with the epsilon offset causing a positive value to become the floor as the inversion was taken.
+  - Of these, `quart_root` gave us the best results, so we used that the rest of the time (beware: the epsilon offset can interact poorly with forecasts clipped to be non-negative).
 
 ### Climatological
 
 This was our term for a forecaster that directly forecast a distribution built from similar weeks from previous seasons (in analogy with baseline weather forecasting).
 We found that in some cases it made a reasonable baseline, though when the current season's peak time was significatly different from the seasons in the training data, it was not particularly effective.
 
+### Linear Trend
+
+A simple linear trend model that predicts the median using linear extrapolation from the past 4 weeks of data and then uses residuals to create a distributional forecast.
+
 ### No Recent Outcome
 
 This was a fall-back forecaster built for the scenario where NHSN data was not going to reported in time for the start of the forecasting challenge.
diff --git a/scripts/reports/climatological_model.Rmd b/scripts/reports/climatological_model.Rmd
index 72da0fef..fc990193 100644
--- a/scripts/reports/climatological_model.Rmd
+++ b/scripts/reports/climatological_model.Rmd
@@ -20,26 +20,10 @@ knitr::opts_chunk$set(
   cache = FALSE
 )
 ggplot2::theme_set(ggplot2::theme_bw())
+suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 ```
 
 ```{r}
-library(dplyr)
-library(ggplot2)
-library(lubridate)
-library(MMWRweek)
-library(readr)
-library(rlang)
-library(magrittr)
-library(aws.s3)
-library(data.table)
-library(dplyr)
-library(DT)
-library(ggplot2)
-library(plotly)
-library(readr)
-library(stringr)
-library(tidyr)
-suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 forecast_date <- as.Date("2024-11-20")
 epi_data <- readr::read_csv("https://data.cdc.gov/resource/ua7e-t2fy.csv?$limit=20000&$select=weekendingdate,jurisdiction,totalconfflunewadm")
 epi_data <- epi_data %>% add_season_info()
diff --git a/scripts/reports/climatological_model_covid.Rmd b/scripts/reports/climatological_model_covid.Rmd
index 94837ae6..b833afcb 100644
--- a/scripts/reports/climatological_model_covid.Rmd
+++ b/scripts/reports/climatological_model_covid.Rmd
@@ -20,26 +20,10 @@ knitr::opts_chunk$set(
   cache = FALSE
 )
 ggplot2::theme_set(ggplot2::theme_bw())
+suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 ```
 
 ```{r}
-library(dplyr)
-library(ggplot2)
-library(lubridate)
-library(MMWRweek)
-library(readr)
-library(rlang)
-library(magrittr)
-library(aws.s3)
-library(data.table)
-library(dplyr)
-library(DT)
-library(ggplot2)
-library(plotly)
-library(readr)
-library(stringr)
-library(tidyr)
-suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 forecast_date <- as.Date("2024-11-20")
 epi_data <- readr::read_csv("https://data.cdc.gov/resource/ua7e-t2fy.csv?$limit=20000&$select=weekendingdate,jurisdiction,totalconfc19newadm")
 epi_data <- epi_data %>% add_season_info()
diff --git a/scripts/reports/comparison-notebook.Rmd b/scripts/reports/comparison-notebook.Rmd
index e315fffa..42fd478a 100644
--- a/scripts/reports/comparison-notebook.Rmd
+++ b/scripts/reports/comparison-notebook.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "`r params$forecaster_family`: evaluation on 2023/24 in 2024/25"
-date: "compiled on `r format(Sys.time(), '%d %B %Y')`"
+date: "Rendered: `r format(Sys.time(), '%Y-%m-%d %H:%M:%S')`"
 output:
   html_document:
     code_folding: hide
@@ -25,18 +25,7 @@ knitr::opts_chunk$set(
   cache = FALSE
 )
 ggplot2::theme_set(ggplot2::theme_bw())
-```
-
-```{r}
-library(data.table)
-library(dplyr)
-library(DT)
-library(ggplot2)
-library(plotly)
-library(readr)
-library(stringr)
-library(tidyr)
-library(purrr)
+suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 ```
 
 ```{r}
diff --git a/scripts/reports/decreasing_forecasters.Rmd b/scripts/reports/decreasing_forecasters.Rmd
index 8b7c4349..69598c31 100644
--- a/scripts/reports/decreasing_forecasters.Rmd
+++ b/scripts/reports/decreasing_forecasters.Rmd
@@ -1,7 +1,7 @@
 ---
 title: "Decreasing Forecasters"
 author: Delphi Forecast Team
-date: "compiled on `r format(Sys.time(), '%d %B %Y')`"
+date: "Rendered: `r format(Sys.time(), '%Y-%m-%d %H:%M:%S')`"
 output:
   html_document:
     code_folding: hide
@@ -773,6 +773,8 @@ forecast %>% plot_dec_forecasts(default_geos)
 
 Turns out, we get better coverage this way.
 
+Note that this is a very naive implementation for the quantiles: each quantile is computed independently, while a more correct approach would convolve the one-ahead quantile distribution of the diffs repeatedly with itself.
+
 Now let's try training on the augmented data (fluview and ILI).
 
 ```{r, eval = FALSE}
diff --git a/scripts/reports/overall-comparison-notebook.Rmd b/scripts/reports/overall-comparison-notebook.Rmd
index a7fcd02b..1c9a7916 100644
--- a/scripts/reports/overall-comparison-notebook.Rmd
+++ b/scripts/reports/overall-comparison-notebook.Rmd
@@ -1,6 +1,6 @@
 ---
 title: Overall Comparison of Hospitalization Forecasters
-date: "`r format(Sys.time(), '%d %B %Y')`"
+date: "Rendered: `r format(Sys.time(), '%Y-%m-%d %H:%M:%S')`"
 output:
   html_document:
     code_folding: hide
@@ -24,28 +24,17 @@ knitr::opts_chunk$set(
   cache = FALSE
 )
 ggplot2::theme_set(ggplot2::theme_bw())
+suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 ```
 
 ```{r}
-library(data.table)
-library(dplyr)
-library(DT)
-library(ggplot2)
-library(plotly)
-library(readr)
-library(stringr)
-library(tidyr)
-library(purrr)
-```
-
-```{r}
-# params <- list(
-#   forecaster_parameters = tar_read(forecaster_parameter_combinations),
-#   forecasts = tar_read(joined_forecasts),
-#   scores = tar_read(joined_scores),
-#   truth_data = tar_read(hhs_evaluation_data),
-#   disease = "covid"
-# )
+params <- list(
+  forecaster_parameters = tar_read(forecaster_parameter_combinations),
+  forecasts = tar_read(joined_forecasts),
+  scores = tar_read(joined_scores),
+  truth_data = tar_read(hhs_evaluation_data),
+  disease = "flu"
+)
 
 if (params$disease == "flu") {
   base_forecaster_name <- "FluSight-baseline"
@@ -66,13 +55,10 @@ id_to_forecaster <- params$forecaster_parameters %>%
   bind_rows() %>%
   rename(forecaster_function = forecaster)
 
-Mean <- function(x) mean(x, na.rm = TRUE)
-GeoMean <- function(x, offset = 0) exp(Mean(log(x + offset)))
-
 overall_rating <- params$scores %>%
-  summarize(mean_score = Mean(wis), .by = forecaster) %>%
-  filter(mean_score > 20) %>%
+  summarize(mean_score = mean(wis), .by = forecaster) %>%
   arrange(mean_score) %>%
+  filter(mean_score > 20) %>%
   left_join(id_to_forecaster, by = join_by(forecaster == id)) %>%
   mutate(
     family_name = if_else(is.na(family_name), "external", family_name),
diff --git a/scripts/reports/revision_summary_report_2025.Rmd b/scripts/reports/revision_summary_report_2025.Rmd
index 6b3d0db2..ddda2657 100644
--- a/scripts/reports/revision_summary_report_2025.Rmd
+++ b/scripts/reports/revision_summary_report_2025.Rmd
@@ -30,7 +30,7 @@ $$\\[.4in]$$
 ```{r echo=FALSE, warning=FALSE,message=FALSE}
 knitr::opts_chunk$set(
   fig.align = "center",
-  message = FALSE,
+  message = TRUE,
   warning = FALSE,
   cache = FALSE
 )
diff --git a/scripts/reports/score_report.Rmd b/scripts/reports/score_report.Rmd
index 26b4d9b0..2e4f3d5c 100644
--- a/scripts/reports/score_report.Rmd
+++ b/scripts/reports/score_report.Rmd
@@ -36,12 +36,10 @@ knitr::opts_chunk$set(
   cache = FALSE
 )
 ggplot2::theme_set(ggplot2::theme_bw())
+suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 ```
 
 ```{r setup, include=FALSE}
-suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
-library(DT)
-
 # Define aggregation functions
 Mean <- function(x) mean(x, na.rm = TRUE)
 GeoMean <- function(x, offset = 0) exp(Mean(log(x + offset)))
diff --git a/scripts/reports/season_summary_2025.Rmd b/scripts/reports/season_summary_2025.Rmd
index 968cf340..a36ec275 100644
--- a/scripts/reports/season_summary_2025.Rmd
+++ b/scripts/reports/season_summary_2025.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "Season Summary 2024-2025"
-date: "compiled on `r format(Sys.time(), '%d %B %Y')`"
+date: "Rendered: `r format(Sys.time(), '%Y-%m-%d %H:%M:%S')`"
 output:
   html_document:
     code_folding: hide
@@ -33,13 +33,10 @@ knitr::opts_chunk$set(
   cache = FALSE
 )
 ggplot2::theme_set(ggplot2::theme_bw())
-source(here::here("R/load_all.R"))
+suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 ```
 
 ```{r setup, include=FALSE}
-library(scales)
-library(DT)
-
 # Define aggregation functions
 Mean <- function(x) mean(x, na.rm = TRUE)
 GeoMean <- function(x, offset = 0) exp(Mean(log(x + offset)))
@@ -85,21 +82,26 @@ forecast_weeks_to_plot %in% (covid_scores$forecast_date %>% unique())
 ```
 
 # Models used
-One thing to note: all of these models filter out the 2020/21 and 2021/22 seasons.
-For both flu and covid these seasons are either unusually large or unusually small, and don't warrant inclusion.
+
+**See [the bottom of the main reports page](https://delphi-forecasting-reports.netlify.app) for a detailed description of the models.**
+
 We can split the models and ensembles into 3 categories: the ad-hoc models that we created in response to the actual data that we saw, the AR models that we had been backtesting, and the ensembles.
+(One thing to note: all of these models filter out the 2020/21 and 2021/22 seasons.
+For both flu and covid these seasons are either unusually large or unusually small, and don't warrant inclusion.)
 
 ### The "ad-hoc" models
 
-- `climate_base` uses a 7 week window around the target and forecast date to establish quantiles.
-  `climate_base` does this separately for each geo.
-- `climate_geo_agged` on the other hand converts to rates, pools all geos, computes quantiles using similar time windows, and then converts back to counts.
+Roughly, these were "climatological" and "linear trend" models.
+
+- `climate_base` matches the current target and forecast date to the same 7 epiweek windows in previous seasons to establish quantiles.
+  `climate_base` gets separate quantiles for each geo.
+- `climate_geo_agged` takes the same approach as `base`, but pools all geos (converts to rates, computes quantiles using similar time windows, and then converts back to counts).
   There is effectively only one prediction, scaled to fit each geo.
 - `linear` does a linear extrapolation of the last 4 weeks of data on a rates scale.
   Initially it had an intercept, but this was removed when it caused the model to not reproduce the -1 ahead data exactly.
   This change was made on Jan 8th, in the commit with hash 5f7892b.
   The quantiles are ad-hoc; the residuals are pooled, symmetrized, truncated using some bounds hand-selected to set the quantiles at a reasonable width, and then propagated forward using `propagate_samples` from epipredict.
-- `climate_linear` combines the `climate_*` models with the `linear` model.
+- `climate_linear` combines the `climate_*` models with the `linear` model using a special weighting scheme.
   It does two linear weightings between the linear model and the climate models.
   As the ahead goes from -1 to 4, it linearly interpolates between a 5% weight on the climate model and a 90% weight on the climate model (so the furthest ahead is mostly a climate model).
   At the same time, as the quantile level goes further away from the median, it interpolates between a 10% weight on the climate model at the median and a 100% weight on the climate model at either the 1% or 99% quantile levels.
@@ -125,13 +127,13 @@ weights %>% filter(forecast_family == "climate") %>% ggplot(aes(x = factor(ahead
 
 ### The AR models
 
-- `windowed_seasonal` is an AR forecaster using lags 0 and 7 that uses training data from an 8 week window from each year.
+- `windowed_seasonal` is an AR forecaster using lags 0 and 7 that uses training data from a matching 8 week window from each year.
   It does quartic root scaling along with quantile and median whitening.
   In addition to dropping the first 2 seasons, the windowed models drop the summers for the purposes of determining whitening behavior.
   For flu, this augments with ili and flusurv (so they are added as additional rows, with their own scaling/centering).
   Covid doesn't have a comparable dataset.
 - `windowed_seasonal_nssp` is like `windowed_seasonal`, but also has `nssp` as an exogenous component.
-  Note that for flu, this effectively means throwing out the ili and flusurv data, since `nssp` is only defined recently.
+  Note that for flu, this effectively means throwing out the ili and flusurv data, since `nssp` is only defined recently (and ili data goes back to 2000).
   For covid, `windowed_seasonal_nssp` is effectively the same model, but with auxiliary data.
 
 ### The general ensembles
@@ -556,7 +558,7 @@ Using `mean_cov_90`, most of these models have quantiles that are too narrow, th
 `retro_submission` does surprisingly well on this metric by hitting 90% exactly.
 
 
-Note that `seasonal_nssp_latest` is better than the other models by any metric, suggesting that revision information would be quite useful for forecasting, if we could get a hold of it. 
+Note that `seasonal_nssp_latest` is better than the other models by any metric, suggesting that revision information would be quite useful for forecasting, if we could get a hold of it.
 It is otherwise identical to `windowed_seasonal_nssp`, so it improves by ~20% and has better coverage.
 Looking at the Phase tab, it does especially well during the `increasing` and `peak` phases, with about the same performance as `windowed_seasonal_nssp` in the `decreasing` phase.
 
@@ -986,6 +988,17 @@ The always decreasing problem is definitely not present in these forecasts.
 If anything, our best forecasts are *too* eager to predict an increasing value, e.g. in `tx` and `ca`.
 Several of our worse forecasts are clearly caused by revision behavior.
 
+# Future Work
+
+Some ideas for future work:
+
+- Incorporate revision behavior.
+  - Nowcasting could help mitigate routine mis-reporting (see below).
+- Change data substitution approach.
+  - Would like to be able to widen uncertainty around a point rather than doing a point substitution.
+- Use the difference rather than the raw value.
+  - Following standard ARIMA practice, this should bring our time series closer to stationarity.
+  - We have some simple demonstrations of this approach forecasts in the [decreasing forecasters notebook](decreasing_forecasters.html).
 
 # Revision behavior and data substitution
 
@@ -1048,7 +1061,7 @@ It was a coin toss for covid, and worse than not doing corrections for flu.
 # Further methods to explore
 
 The [decreasing forecasters notebook](decreasing_forecasters.html) has a number of suggestions, though that is a problem that occurs most frequently with Flu data rather than Covid.
-The broad categories there are 
+The broad categories there are
 
 1. Filtering to the relevant phase; `windowed_seasonal_*` is roughly an example of this, which is likely why it outperformed simple AR by enough to be better.
 2. better use of non-linear models. This would allow us to capture increasing, decreasing, and flat trends in the same model.

From dc917a273b1782426f386f708c7b88427fbdbf97 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Thu, 8 May 2025 14:31:54 -0700
Subject: [PATCH 50/62] wip: exploration summary

---
 scripts/reports/exploration_summary_2025.Rmd | 78 ++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 scripts/reports/exploration_summary_2025.Rmd

diff --git a/scripts/reports/exploration_summary_2025.Rmd b/scripts/reports/exploration_summary_2025.Rmd
new file mode 100644
index 00000000..272db3b3
--- /dev/null
+++ b/scripts/reports/exploration_summary_2025.Rmd
@@ -0,0 +1,78 @@
+---
+title: "Exploration Summary 2024-2025"
+date: "Rendered: `r format(Sys.time(), '%Y-%m-%d %H:%M:%S')`"
+output:
+  html_document:
+    code_folding: hide
+    toc: True
+editor_options:
+  chunk_output_type: console
+---
+
+```{css, echo=FALSE}
+body {
+  display: block;
+  max-width: 1280px !important;
+  margin-left: auto;
+  margin-right: auto;
+}
+
+body .main-container {
+  max-width: 1280px !important;
+  width: 1280px !important;
+}
+```
+
+$$\\[.4in]$$
+
+```{r echo=FALSE, warning=FALSE,message=FALSE}
+knitr::opts_chunk$set(
+  fig.align = "center",
+  message = FALSE,
+  warning = FALSE,
+  cache = FALSE
+)
+ggplot2::theme_set(ggplot2::theme_bw())
+source(here::here("R/load_all.R"))
+```
+
+In this document, we will summarize our findings from backtesting a large variety of forecasters on the 2023-2024 season.
+The forecaster family definitions can be found at the bottom of [this page](https://delphi-forecasting-reports.netlify.app/).
+
+## Best Performing Families
+
+### Flu
+
+[The best performing families](https://delphi-forecasting-reports.netlify.app/flu-overall-notebook) were:
+
+- AR with seasonal windows and the NSSP exogenous feature
+  - This forecaster was about 10 mean WIS points behind UMass-flusion, but on par with the FluSight-ensemble.
+- AR with seasonal window (same as above, but without the NSSP exogenous feature)
+  - This forecaster was only 2 mean WIS points behind the above forecaster.
+  - We explored a wide variety of parameters for this family and found that the number of weeks to include in the training window was not particularly important, so we settled on 5 weeks prior and 3 weeks ahead.
+- An ensemble of climatological and the linear trend model (we used this at the start of the season when we didn't trust the data to support a more complex model)
+  - We were surprised to find that this was only 7 mean WIS points behind our best performing family.
+- For context, the gap between our best performing family and FluSight-baseline was only about 15 mean WIS points.
+
+### Covid
+
+[The best performing families](https://delphi-forecasting-reports.netlify.app/covid-overall-notebook) were:
+
+- AR with seasonal windows and the NSSP exogenous feature
+  - This forecaster was about 10 mean WIS points behind UMass-flusion, but on par with the FluSight-ensemble.
+- Surprisingly, the `climate_linear` model was not far behind our best performing family.
+  (`climate_linear` combines the `climate_*` models with the `linear` model using a special weighting scheme.
+  See the [season summary](season_summary_2025.html) for more details.)
+
+
+
+## Important Parameters
+
+- Forecasters that used a seasonal training window were substantially better than those that did not.
+- Forecasters that used the NSSP exogenous feature were substantially better than those that did not.
+
+## Important Notes
+
+One of the most concerning behaviors in our forecasters was the bias towards predicting a down-swing in the target.
+After a deeper analysis, we concluded that this is due to a downward bias in the data set, which our linear AR models were picking up and translating into coefficients that were less than 1, making declines almost certain.
+The complete analysis can be found [here](https://delphi-forecasting-reports.netlify.app/decreasing_forecasters).

From 3799554347552f707730cdc2c5cc18f5b33855d0 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Thu, 8 May 2025 14:36:49 -0700
Subject: [PATCH 51/62] doc: template

---
 reports/template.md | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/reports/template.md b/reports/template.md
index 8d54f95b..7f98c7c3 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -1,5 +1,4 @@
 <style>
-/* Some basic styling (a reasonable reading width and dark mode support) */
 body {
   max-width: 800px;
   margin: 2rem auto;
@@ -8,30 +7,6 @@ body {
   background: white;
   color: black;
 }
-
-a:link {
-  color: blue;
-}
-
-a:visited {
-  color: purple;
-}
-
-/* Dark mode support */
-@media (prefers-color-scheme: dark) {
-  body {
-    background: #121212;
-    color: #e0e0e0;
-  }
-
-  a:link {
-    color: #80cbc4;
-  }
-
-  a:visited {
-    color: #b39ddb; /* light purple for contrast on dark bg */
-  }
-}
 </style>
 
 # Delphi Forecast Reports

From 0d9b6bd2653c622382ed40424acfad2fb11c418e Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Thu, 8 May 2025 14:38:34 -0700
Subject: [PATCH 52/62] fix: remove debug from overall notebook

---
 reports/template.md                             |  2 +-
 scripts/reports/overall-comparison-notebook.Rmd | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/reports/template.md b/reports/template.md
index 7f98c7c3..8c1a4cbb 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -171,7 +171,7 @@ A simple linear trend model that predicts the median using linear extrapolation
 
 ### No Recent Outcome
 
-This was a fall-back forecaster built for the scenario where NHSN data was not going to reported in time for the start of the forecasting challenge.
+This was a fall-back forecaster built for the scenario where NHSN data was not going to be reported in time for the start of the forecasting challenge.
 
 A flusion-adjacent model pared down to handle the case of not having the target as a predictor.
 
diff --git a/scripts/reports/overall-comparison-notebook.Rmd b/scripts/reports/overall-comparison-notebook.Rmd
index 1c9a7916..5c10b8c5 100644
--- a/scripts/reports/overall-comparison-notebook.Rmd
+++ b/scripts/reports/overall-comparison-notebook.Rmd
@@ -28,13 +28,13 @@ suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 ```
 
 ```{r}
-params <- list(
-  forecaster_parameters = tar_read(forecaster_parameter_combinations),
-  forecasts = tar_read(joined_forecasts),
-  scores = tar_read(joined_scores),
-  truth_data = tar_read(hhs_evaluation_data),
-  disease = "flu"
-)
+# params <- list(
+#   forecaster_parameters = tar_read(forecaster_parameter_combinations),
+#   forecasts = tar_read(joined_forecasts),
+#   scores = tar_read(joined_scores),
+#   truth_data = tar_read(hhs_evaluation_data),
+#   disease = "flu"
+# )
 
 if (params$disease == "flu") {
   base_forecaster_name <- "FluSight-baseline"

From dae68be9e218bb756d248126d0705d4ac26e5bd3 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Thu, 8 May 2025 16:10:24 -0700
Subject: [PATCH 53/62] doc: exploration summary

---
 R/targets/covid_forecaster_config.R           |  1 +
 R/targets/flu_forecaster_config.R             |  4 +-
 reports/style.css                             | 12 ++---
 reports/template.md                           | 31 ++++--------
 scripts/reports/comparison-notebook.Rmd       | 14 ++----
 ...y_2025.Rmd => exploration_summary_2025.md} | 49 +++----------------
 6 files changed, 29 insertions(+), 82 deletions(-)
 rename scripts/reports/{exploration_summary_2025.Rmd => exploration_summary_2025.md} (71%)

diff --git a/R/targets/covid_forecaster_config.R b/R/targets/covid_forecaster_config.R
index 564addff..e746ae71 100644
--- a/R/targets/covid_forecaster_config.R
+++ b/R/targets/covid_forecaster_config.R
@@ -146,6 +146,7 @@ get_covid_forecaster_params <- function() {
         residual_tail = 0.97,
         residual_center = 0.097
       ),
+      # only linear, a bunch of the parameters don't matter for it
       expand_grid(
         forecaster = "climate_linear_ensembled",
         scale_method = "none",
diff --git a/R/targets/flu_forecaster_config.R b/R/targets/flu_forecaster_config.R
index f75e8dcb..a62cf1a4 100644
--- a/R/targets/flu_forecaster_config.R
+++ b/R/targets/flu_forecaster_config.R
@@ -282,7 +282,7 @@ get_flu_forecaster_params <- function() {
         scale_method = "quantile",
         center_method = "median",
         nonlin_method = c("quart_root", "none"),
-        model_used = c("climate_linear", "linear", "climate", "climatological_forecaster"),
+        model_used = c("climate_linear", "climate", "climatological_forecaster"),
         filter_source = c("", "nhsn"),
         filter_agg_level = "state",
         drop_non_seasons = c(TRUE, FALSE),
@@ -296,7 +296,7 @@ get_flu_forecaster_params <- function() {
         scale_method = "none",
         center_method = "none",
         nonlin_method = c("quart_root", "none"),
-        model_used = c("climate_linear", "linear", "climate", "climatological_forecaster"),
+        model_used = c("climate_linear", "climate", "climatological_forecaster"),
         filter_source = "nhsn",
         filter_agg_level = "state",
         drop_non_seasons = c(TRUE, FALSE),
diff --git a/reports/style.css b/reports/style.css
index 56fecd8b..d1c11792 100644
--- a/reports/style.css
+++ b/reports/style.css
@@ -1,8 +1,8 @@
 body {
-    font-family: Arial, sans-serif;
-    margin: 2em;
-}
-.container {
-    max-width: 800px;
-    margin: auto;
+  max-width: 800px;
+  margin: 2rem auto;
+  padding: 0 1rem;
+  font-family: sans-serif;
+  background: white;
+  color: black;
 }
\ No newline at end of file
diff --git a/reports/template.md b/reports/template.md
index 8c1a4cbb..6302801b 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -1,14 +1,3 @@
-<style>
-body {
-  max-width: 800px;
-  margin: 2rem auto;
-  padding: 0 1rem;
-  font-family: sans-serif;
-  background: white;
-  color: black;
-}
-</style>
-
 # Delphi Forecast Reports
 
 [GitHub Repo](https://github.com/cmu-delphi/explorationt-tooling/)
@@ -35,8 +24,6 @@ body {
 
 - [Forecaster Exploration Summary](exploration_summary_2024.html)
 - Flu
-  - All forecasters population scale their data, use geo pooling, and train using quantreg.
-  - These definitions are in the `flu_forecaster_config.R` file.
   - [Flu Overall](flu-overall-notebook.html)
   - [Flu AR](flu-notebook-scaled_pop_main.html)
   - [Flu AR with augmented data](flu-notebook-scaled_pop_data_augmented.html)
@@ -45,13 +32,10 @@ body {
   - [Flu AR with augmented data and with different seasonal window sizes](flu-notebook-season_window_sizes.html)
   - [Flu AR with augmented data, exogenous features, and seasonal windowing](flu-notebook-scaled_pop_season_exogenous.html)
   - Simplistic/low data methods:
-    - [Flu no recent](flu-notebook-no_recent_quant.html)
     - [Flu no recent](flu-notebook-no_recent_quant.html)
     - [Flu flatline](flu-notebook-flatline.html)
-    - [Flu climate](flu-notebook-climate_linear.html)
+    - [Flu climate and linear](flu-notebook-climate_linear.html)
 - Covid
-  - All forecasters population scale their data, use geo pooling, and train using quantreg.
-  - These definitions are in the `covid_forecaster_config.R` file.
   - [Covid Overall](covid-overall-notebook.html)
   - [Covid AR](covid-notebook-scaled_pop_main.html)
   - [Covid AR with seasonal features](covid-notebook-scaled_pop_season.html)
@@ -60,7 +44,7 @@ body {
   - Simplistic/low data methods:
     - [Covid no recent](covid-notebook-no_recent_quant.html)
     - [Covid flatline](covid-notebook-flatline.html)
-    - [Covid climate](covid-notebook-climate_linear.html)
+    - [Covid climate and linear](covid-notebook-climate_linear.html)
 
 ## Description of Forecaster Families
 
@@ -77,10 +61,13 @@ The main forecaster families were:
 - Baseline models
   - Flatline
 
-All the AR models had the option of population scaling, seasonal features, exogenous features, and augmented data.
-We tried all possible combinations of these features.
-All models had the option of using the `linreg`, `quantreg`, or `grf` engine.
-We found that `quantreg` gave better results than `linreg` and we had computational issues with `grf`, so we used `quantreg` the rest of the time.
+Notes:
+
+- All forecasters population scale their data, use geo pooling, and train using quantreg.
+  We found that `quantreg` gave better results than `linreg` and we had computational issues with `grf`, so we used `quantreg` the rest of the time.
+- All the AR models had the option of population scaling, seasonal features, exogenous features, and augmented data.
+  We tried all possible combinations of these features (in notebooks above).
+- The forecaster definitions are in the [`flu_forecaster_config.R`](https://github.com/cmu-delphi/exploration-tooling/blob/main/R/targets/flu_forecaster_config.R) and [`covid_forecaster_config.R`](https://github.com/cmu-delphi/exploration-tooling/blob/main/R/targets/covid_forecaster_config.R) files.
 
 ### Autoregressive models (AR)
 
diff --git a/scripts/reports/comparison-notebook.Rmd b/scripts/reports/comparison-notebook.Rmd
index 42fd478a..c378d294 100644
--- a/scripts/reports/comparison-notebook.Rmd
+++ b/scripts/reports/comparison-notebook.Rmd
@@ -593,7 +593,7 @@ forecast_subset <- params$forecasts %>%
   pivot_wider(names_from = "quantile", values_from = "prediction") %>%
   mutate(ahead = as.numeric(target_end_date - forecast_date))
 
-p <- ggplot(
+ggplot(
   data = forecast_subset,
   aes(x = target_end_date, group = forecast_date)
 ) +
@@ -615,9 +615,6 @@ p <- ggplot(
   facet_grid(factor(forecaster, levels = param_table$id) ~ geo_value, scales = "free") +
   labs(x = "Reference Date", y = "Weekly Sums of Hospitalizations", title = "Monthly Forecasts and Truth Data") +
   theme(legend.position = "none")
-
-ggplotly(p, tooltip = "text", height = 300 * length(param_table$id), width = 1000) %>%
-  layout(hoverlabel = list(bgcolor = "white"))
 ```
 
 #### Medium States
@@ -634,7 +631,7 @@ forecast_subset <- params$forecasts %>%
   pivot_wider(names_from = "quantile", values_from = "prediction") %>%
   mutate(ahead = as.numeric(target_end_date - forecast_date))
 
-p <- ggplot(
+ggplot(
   data = forecast_subset,
   aes(x = target_end_date, group = forecast_date)
 ) +
@@ -657,8 +654,6 @@ p <- ggplot(
   labs(x = "Reference Date", y = "Weekly Sums of Hospitalizations", title = "Monthly Forecasts and Truth Data") +
   theme(legend.position = "none")
 
-ggplotly(p, tooltip = "text", height = 300 * length(param_table$id), width = 1000) %>%
-  layout(hoverlabel = list(bgcolor = "white"))
 ```
 
 #### Small States
@@ -675,7 +670,7 @@ forecast_subset <- params$forecasts %>%
   pivot_wider(names_from = "quantile", values_from = "prediction") %>%
   mutate(ahead = as.numeric(target_end_date - forecast_date))
 
-p <- ggplot(
+ggplot(
   data = forecast_subset,
   aes(x = target_end_date, group = forecast_date)
 ) +
@@ -697,7 +692,4 @@ p <- ggplot(
   facet_grid(factor(forecaster, levels = param_table$id) ~ geo_value, scales = "free") +
   labs(x = "Reference Date", y = "Weekly Sums of Hospitalizations", title = "Monthly Forecasts and Truth Data") +
   theme(legend.position = "none")
-
-ggplotly(p, tooltip = "text", height = 300 * length(param_table$id), width = 1000) %>%
-  layout(hoverlabel = list(bgcolor = "white"))
 ```
diff --git a/scripts/reports/exploration_summary_2025.Rmd b/scripts/reports/exploration_summary_2025.md
similarity index 71%
rename from scripts/reports/exploration_summary_2025.Rmd
rename to scripts/reports/exploration_summary_2025.md
index 272db3b3..df8bf48b 100644
--- a/scripts/reports/exploration_summary_2025.Rmd
+++ b/scripts/reports/exploration_summary_2025.md
@@ -1,40 +1,7 @@
----
-title: "Exploration Summary 2024-2025"
-date: "Rendered: `r format(Sys.time(), '%Y-%m-%d %H:%M:%S')`"
-output:
-  html_document:
-    code_folding: hide
-    toc: True
-editor_options:
-  chunk_output_type: console
----
+<!-- Render this with pandoc -->
+<!-- pandoc scripts/reports/exploration_summary_2025.md -s -o reports/exploration_summary_2025.html --css reports/style.css --metadata pagetitle='Exploration Summary 2024-2025' -->
 
-```{css, echo=FALSE}
-body {
-  display: block;
-  max-width: 1280px !important;
-  margin-left: auto;
-  margin-right: auto;
-}
-
-body .main-container {
-  max-width: 1280px !important;
-  width: 1280px !important;
-}
-```
-
-$$\\[.4in]$$
-
-```{r echo=FALSE, warning=FALSE,message=FALSE}
-knitr::opts_chunk$set(
-  fig.align = "center",
-  message = FALSE,
-  warning = FALSE,
-  cache = FALSE
-)
-ggplot2::theme_set(ggplot2::theme_bw())
-source(here::here("R/load_all.R"))
-```
+# Exploration Summary 2024-2025
 
 In this document, we will summarize our findings from backtesting a large variety of forecasters on the 2023-2024 season.
 The forecaster family definitions can be found at the bottom of [this page](https://delphi-forecasting-reports.netlify.app/).
@@ -53,19 +20,19 @@ The forecaster family definitions can be found at the bottom of [this page](http
 - An ensemble of climatological and the linear trend model (we used this at the start of the season when we didn't trust the data to support a more complex model)
   - We were surprised to find that this was only 7 mean WIS points behind our best performing family.
 - For context, the gap between our best performing family and FluSight-baseline was only about 15 mean WIS points.
+- Surprisingly, AR forecasters with augmented data performed **worse** than those that did not.
+  However, AR forecasters with seasonal windows and augmented data performed better than AR forecasters with only seasonal windows.
 
 ### Covid
 
 [The best performing families](https://delphi-forecasting-reports.netlify.app/covid-overall-notebook) were:
 
-- AR with seasonal windows and the NSSP exogenous feature
-  - This forecaster was about 10 mean WIS points behind UMass-flusion, but on par with the FluSight-ensemble.
-- Surprisingly, the `climate_linear` model was not far behind our best performing family.
+- AR with seasonal windows and the NSSP exogenous feature.
+  - This forecaster outperformed the CDC ensemble by about 15 mean WIS points.
+- Surprisingly, the `climate_linear` model was only about 4 mean WIS points behind our best performing family.
   (`climate_linear` combines the `climate_*` models with the `linear` model using a special weighting scheme.
   See the [season summary](season_summary_2025.html) for more details.)
 
-
-
 ## Important Parameters
 
 - Forecasters that used a seasonal training window were substantially better than those that did not.

From c961f8d0903c1fca56419fe6dd05ba448072a9be Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Thu, 8 May 2025 16:14:11 -0700
Subject: [PATCH 54/62] repo: renv

---
 renv.lock | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/renv.lock b/renv.lock
index bde6fe4c..d1f65851 100644
--- a/renv.lock
+++ b/renv.lock
@@ -9,6 +9,12 @@
     ]
   },
   "Packages": {
+    "AsioHeaders": {
+      "Package": "AsioHeaders",
+      "Version": "1.30.2-1",
+      "Source": "Repository",
+      "Repository": "RSPM"
+    },
     "BH": {
       "Package": "BH",
       "Version": "1.87.0-1",
@@ -4077,6 +4083,12 @@
       "Maintainer": "Kirill Müller <krlmlr+r@mailbox.org>",
       "Repository": "RSPM"
     },
+    "hexbin": {
+      "Package": "hexbin",
+      "Version": "1.28.5",
+      "Source": "Repository",
+      "Repository": "RSPM"
+    },
     "highr": {
       "Package": "highr",
       "Version": "0.11",
@@ -4218,6 +4230,17 @@
       "Maintainer": "Carson Sievert <carson@posit.co>",
       "Repository": "CRAN"
     },
+    "httpgd": {
+      "Package": "httpgd",
+      "Version": "2.0.4",
+      "Source": "GitHub",
+      "RemoteType": "github",
+      "RemoteHost": "api.github.com",
+      "RemoteUsername": "nx10",
+      "RemoteRepo": "httpgd",
+      "RemoteRef": "master",
+      "RemoteSha": "dd6ed3a687a2d7327bb28ca46725a0a203eb2a19"
+    },
     "httpuv": {
       "Package": "httpuv",
       "Version": "1.6.15",
@@ -9220,6 +9243,12 @@
       "Maintainer": "Davis Vaughan <davis@posit.co>",
       "Repository": "RSPM"
     },
+    "unigd": {
+      "Package": "unigd",
+      "Version": "0.1.3",
+      "Source": "Repository",
+      "Repository": "RSPM"
+    },
     "urlchecker": {
       "Package": "urlchecker",
       "Version": "1.0.1",

From fa6bbab5489ad857fc90017053c8d67c2e447ce1 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Thu, 8 May 2025 16:17:32 -0700
Subject: [PATCH 55/62] lint: remove priority target args, as they're
 deprecated

---
 scripts/flu_hosp_prod.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/flu_hosp_prod.R b/scripts/flu_hosp_prod.R
index 3dd7a57c..ebceba52 100644
--- a/scripts/flu_hosp_prod.R
+++ b/scripts/flu_hosp_prod.R
@@ -381,8 +381,7 @@ ensemble_targets <- tar_map(
       } else {
         cli_alert_info("Not making climate submission csv because we're in backtest mode or submission directory is cache")
       }
-    },
-    priority = 0.99
+    }
   ),
   tar_target(
     name = validate_result,

From e065f8dd55e4a9388c5d7d9ff1669ab0e2472d91 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Thu, 8 May 2025 16:26:47 -0700
Subject: [PATCH 56/62] repo: renv

---
 renv.lock | 120 +-----------------------------------------------------
 1 file changed, 2 insertions(+), 118 deletions(-)

diff --git a/renv.lock b/renv.lock
index d1f65851..4789cb91 100644
--- a/renv.lock
+++ b/renv.lock
@@ -1955,50 +1955,8 @@
     },
     "crew": {
       "Package": "crew",
-      "Version": "1.0.0",
+      "Version": "1.1.0",
       "Source": "Repository",
-      "Title": "A Distributed Worker Launcher Framework",
-      "Description": "In computationally demanding analysis projects, statisticians and data scientists asynchronously deploy long-running tasks to distributed systems, ranging from traditional clusters to cloud services. The 'NNG'-powered 'mirai' R package by Gao (2023) <doi:10.5281/zenodo.7912722> is a sleek and sophisticated scheduler that efficiently processes these intense workloads. The 'crew' package extends 'mirai' with a unifying interface for third-party worker launchers. Inspiration also comes from packages. 'future' by Bengtsson (2021) <doi:10.32614/RJ-2021-048>, 'rrq' by FitzJohn and Ashton (2023) <https://github.com/mrc-ide/rrq>, 'clustermq' by Schubert (2019) <doi:10.1093/bioinformatics/btz284>), and 'batchtools' by Lang, Bischel, and Surmann (2017) <doi:10.21105/joss.00135>.",
-      "License": "MIT + file LICENSE",
-      "URL": "https://wlandau.github.io/crew/, https://github.com/wlandau/crew",
-      "BugReports": "https://github.com/wlandau/crew/issues",
-      "Authors@R": "c( person( given = c(\"William\", \"Michael\"), family = \"Landau\", role = c(\"aut\", \"cre\"), email = \"will.landau.oss@gmail.com\", comment = c(ORCID = \"0000-0003-1878-3253\") ), person( given = \"Daniel\", family = \"Woodie\", role = \"ctb\" ), person( family = \"Eli Lilly and Company\", role = c(\"cph\", \"fnd\") ))",
-      "Depends": [
-        "R (>= 4.0.0)"
-      ],
-      "Imports": [
-        "cli (>= 3.1.0)",
-        "data.table",
-        "getip",
-        "later",
-        "mirai (>= 2.0.1)",
-        "nanonext (>= 1.4.0)",
-        "processx",
-        "promises",
-        "ps",
-        "R6",
-        "rlang",
-        "stats",
-        "tibble",
-        "tidyselect",
-        "tools",
-        "utils"
-      ],
-      "Suggests": [
-        "autometric (>= 0.1.0)",
-        "knitr (>= 1.30)",
-        "markdown (>= 1.1)",
-        "rmarkdown (>= 2.4)",
-        "testthat (>= 3.0.0)"
-      ],
-      "Encoding": "UTF-8",
-      "Language": "en-US",
-      "VignetteBuilder": "knitr",
-      "Config/testthat/edition": "3",
-      "RoxygenNote": "7.3.2",
-      "NeedsCompilation": "no",
-      "Author": "William Michael Landau [aut, cre] (<https://orcid.org/0000-0003-1878-3253>), Daniel Woodie [ctb], Eli Lilly and Company [cph, fnd]",
-      "Maintainer": "William Michael Landau <will.landau.oss@gmail.com>",
       "Repository": "RSPM"
     },
     "crosstalk": {
@@ -8673,82 +8631,8 @@
     },
     "targets": {
       "Package": "targets",
-      "Version": "1.10.1",
+      "Version": "1.11.1",
       "Source": "Repository",
-      "Title": "Dynamic Function-Oriented 'Make'-Like Declarative Pipelines",
-      "Description": "Pipeline tools coordinate the pieces of computationally demanding analysis projects. The 'targets' package is a 'Make'-like pipeline tool for statistics and data science in R. The package skips costly runtime for tasks that are already up to date, orchestrates the necessary computation with implicit parallel computing, and abstracts files as R objects. If all the current output matches the current upstream code and data, then the whole pipeline is up to date, and the results are more trustworthy than otherwise. The methodology in this package borrows from GNU 'Make' (2015, ISBN:978-9881443519) and 'drake' (2018, <doi:10.21105/joss.00550>).",
-      "License": "MIT + file LICENSE",
-      "URL": "https://docs.ropensci.org/targets/, https://github.com/ropensci/targets",
-      "BugReports": "https://github.com/ropensci/targets/issues",
-      "Authors@R": "c( person( given = c(\"William\", \"Michael\"), family = \"Landau\", role = c(\"aut\", \"cre\"), email = \"will.landau.oss@gmail.com\", comment = c(ORCID = \"0000-0003-1878-3253\") ), person( given = c(\"Matthew\", \"T.\"), family = \"Warkentin\", role = \"ctb\" ), person( given = \"Mark\", family = \"Edmondson\", email = \"r@sunholo.com\", role = \"ctb\", comment = c(ORCID = \"0000-0002-8434-3881\") ), person( given = \"Samantha\", family = \"Oliver\", role = \"rev\", comment = c(ORCID = \"0000-0001-5668-1165\") ), person( given = \"Tristan\", family = \"Mahr\", role = \"rev\", comment = c(ORCID = \"0000-0002-8890-5116\") ), person( family = \"Eli Lilly and Company\", role = c(\"cph\", \"fnd\") ))",
-      "Depends": [
-        "R (>= 3.5.0)"
-      ],
-      "Imports": [
-        "base64url (>= 1.4)",
-        "callr (>= 3.7.0)",
-        "cli (>= 2.0.2)",
-        "codetools (>= 0.2.16)",
-        "data.table (>= 1.12.8)",
-        "igraph (>= 2.0.0)",
-        "knitr (>= 1.34)",
-        "ps (>= 1.8.0)",
-        "R6 (>= 2.4.1)",
-        "rlang (>= 1.0.0)",
-        "secretbase (>= 0.5.0)",
-        "stats",
-        "tibble (>= 3.0.1)",
-        "tidyselect (>= 1.1.0)",
-        "tools",
-        "utils",
-        "vctrs (>= 0.2.4)",
-        "yaml (>= 2.2.1)"
-      ],
-      "Suggests": [
-        "autometric (>= 0.1.0)",
-        "bslib",
-        "clustermq (>= 0.9.2)",
-        "crew (>= 0.9.0)",
-        "curl (>= 4.3)",
-        "DT (>= 0.14)",
-        "dplyr (>= 1.0.0)",
-        "fst (>= 0.9.2)",
-        "future (>= 1.19.1)",
-        "future.batchtools (>= 0.9.0)",
-        "future.callr (>= 0.6.0)",
-        "gargle (>= 1.2.0)",
-        "googleCloudStorageR (>= 0.7.0)",
-        "gt (>= 0.2.2)",
-        "keras (>= 2.2.5.0)",
-        "markdown (>= 1.1)",
-        "nanonext (>= 0.12.0)",
-        "rmarkdown (>= 2.4)",
-        "parallelly (>= 1.35.0)",
-        "paws.common (>= 0.6.4)",
-        "paws.storage (>= 0.4.0)",
-        "pkgload (>= 1.1.0)",
-        "processx (>= 3.4.3)",
-        "qs2",
-        "reprex (>= 2.0.0)",
-        "rstudioapi (>= 0.11)",
-        "R.utils (>= 2.6.0)",
-        "shiny (>= 1.5.0)",
-        "shinybusy (>= 0.2.2)",
-        "shinyWidgets (>= 0.5.4)",
-        "tarchetypes",
-        "testthat (>= 3.0.0)",
-        "torch (>= 0.1.0)",
-        "usethis (>= 1.6.3)",
-        "visNetwork (>= 2.1.2)"
-      ],
-      "Encoding": "UTF-8",
-      "Language": "en-US",
-      "VignetteBuilder": "knitr",
-      "Config/testthat/edition": "3",
-      "RoxygenNote": "7.3.2",
-      "NeedsCompilation": "no",
-      "Author": "William Michael Landau [aut, cre] (<https://orcid.org/0000-0003-1878-3253>), Matthew T. Warkentin [ctb], Mark Edmondson [ctb] (<https://orcid.org/0000-0002-8434-3881>), Samantha Oliver [rev] (<https://orcid.org/0000-0001-5668-1165>), Tristan Mahr [rev] (<https://orcid.org/0000-0002-8890-5116>), Eli Lilly and Company [cph, fnd]",
-      "Maintainer": "William Michael Landau <will.landau.oss@gmail.com>",
       "Repository": "RSPM"
     },
     "testthat": {

From 3386797931327bfc031264298fa9822bed8490fb Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Thu, 8 May 2025 18:19:57 -0700
Subject: [PATCH 57/62] fix: a very complicated way to fix a bug

---
 R/forecasters/data_validation.R               | 14 --------
 R/forecasters/ensemble_average.R              |  2 +-
 R/forecasters/forecaster_climatological.R     |  5 ++-
 R/forecasters/forecaster_flatline.R           |  5 ++-
 R/forecasters/forecaster_flusion.R            |  5 ++-
 R/forecasters/forecaster_no_recent_outcome.R  |  5 ++-
 R/forecasters/forecaster_scaled_pop.R         |  5 ++-
 .../forecaster_scaled_pop_seasonal.R          |  8 ++---
 R/forecasters/forecaster_smoothed_scaled.R    |  5 ++-
 R/targets/covid_forecaster_config.R           | 22 ++++++------
 R/targets/flu_forecaster_config.R             | 36 +++++--------------
 R/utils.R                                     |  2 +-
 scripts/covid_hosp_explore.R                  |  4 +--
 scripts/flu_hosp_explore.R                    |  4 +--
 scripts/one_offs/forecaster_profiling.R       |  2 +-
 15 files changed, 40 insertions(+), 84 deletions(-)

diff --git a/R/forecasters/data_validation.R b/R/forecasters/data_validation.R
index 40fce850..2d9f610b 100644
--- a/R/forecasters/data_validation.R
+++ b/R/forecasters/data_validation.R
@@ -106,17 +106,3 @@ filter_minus_one_ahead <- function(epi_data, ahead) {
   }
   epi_data
 }
-
-#' Unwrap an argument if it's a list of length 1
-#'
-#' Many of our arguments to the forecasters come as lists not because we expect
-#' them that way, but as a byproduct of tibble and expand_grid.
-unwrap_argument <- function(arg, default_trigger = "", default = character(0L)) {
-  if (is.list(arg) && length(arg) == 1) {
-    arg <- arg[[1]]
-  }
-  if (identical(arg, default_trigger)) {
-    return(default)
-  }
-  return(arg)
-}
diff --git a/R/forecasters/ensemble_average.R b/R/forecasters/ensemble_average.R
index 81a79cd8..e3e9826b 100644
--- a/R/forecasters/ensemble_average.R
+++ b/R/forecasters/ensemble_average.R
@@ -25,7 +25,7 @@
 ensemble_average <- function(epi_data,
                              forecasts,
                              outcome,
-                             extra_sources = "",
+                             extra_sources = character(),
                              ensemble_args = list(),
                              ensemble_args_names = NULL) {
   # unique parameters must be buried in ensemble_args so that the generic function signature is stable
diff --git a/R/forecasters/forecaster_climatological.R b/R/forecasters/forecaster_climatological.R
index 9dec800e..d0eb9ccc 100644
--- a/R/forecasters/forecaster_climatological.R
+++ b/R/forecasters/forecaster_climatological.R
@@ -2,7 +2,7 @@
 #'
 climate_linear_ensembled <- function(epi_data,
                                      outcome,
-                                     extra_sources = "",
+                                     extra_sources = character(),
                                      ahead = 7,
                                      trainer = parsnip::linear_reg(),
                                      quantile_levels = covidhub_probs(),
@@ -22,8 +22,7 @@ climate_linear_ensembled <- function(epi_data,
   nonlin_method <- arg_match(nonlin_method)
 
   epi_data <- validate_epi_data(epi_data)
-  extra_sources <- unwrap_argument(extra_sources)
-  trainer <- unwrap_argument(trainer)
+  extra_sources <- unlist(extra_sources)
 
   args_list <- list(...)
   ahead <- as.integer(ahead / 7)
diff --git a/R/forecasters/forecaster_flatline.R b/R/forecasters/forecaster_flatline.R
index ad4dea77..081f165c 100644
--- a/R/forecasters/forecaster_flatline.R
+++ b/R/forecasters/forecaster_flatline.R
@@ -10,7 +10,7 @@
 #' @export
 flatline_fc <- function(epi_data,
                         outcome,
-                        extra_sources = "",
+                        extra_sources = character(),
                         ahead = 1,
                         trainer = parsnip::linear_reg(),
                         quantile_levels = covidhub_probs(),
@@ -18,8 +18,7 @@ flatline_fc <- function(epi_data,
                         filter_agg_level = "",
                         ...) {
   epi_data <- validate_epi_data(epi_data)
-  extra_sources <- unwrap_argument(extra_sources)
-  trainer <- unwrap_argument(trainer)
+  extra_sources <- unlist(extra_sources)
 
   # perform any preprocessing not supported by epipredict
   epi_data %<>% filter_extraneous(filter_source, filter_agg_level)
diff --git a/R/forecasters/forecaster_flusion.R b/R/forecasters/forecaster_flusion.R
index 61246efd..ad6e415c 100644
--- a/R/forecasters/forecaster_flusion.R
+++ b/R/forecasters/forecaster_flusion.R
@@ -1,6 +1,6 @@
 flusion <- function(epi_data,
                     outcome,
-                    extra_sources = "",
+                    extra_sources = character(),
                     ahead = 7,
                     pop_scaling = FALSE,
                     trainer = rand_forest(
@@ -24,8 +24,7 @@ flusion <- function(epi_data,
   derivative_estimator <- arg_match(derivative_estimator)
 
   epi_data <- validate_epi_data(epi_data)
-  extra_sources <- unwrap_argument(extra_sources)
-  trainer <- unwrap_argument(trainer)
+  extra_sources <- unlist(extra_sources)
 
   # perform any preprocessing not supported by epipredict
   args_input <- list(...)
diff --git a/R/forecasters/forecaster_no_recent_outcome.R b/R/forecasters/forecaster_no_recent_outcome.R
index 4227b67c..15b7fb82 100644
--- a/R/forecasters/forecaster_no_recent_outcome.R
+++ b/R/forecasters/forecaster_no_recent_outcome.R
@@ -2,7 +2,7 @@
 #' it may whiten any old data as the outcome
 no_recent_outcome <- function(epi_data,
                               outcome,
-                              extra_sources = "",
+                              extra_sources = character(),
                               ahead = 7,
                               pop_scaling = FALSE,
                               trainer = epipredict::quantile_reg(),
@@ -24,8 +24,7 @@ no_recent_outcome <- function(epi_data,
   week_method <- arg_match(week_method)
 
   epi_data <- validate_epi_data(epi_data)
-  extra_sources <- unwrap_argument(extra_sources)
-  trainer <- unwrap_argument(trainer)
+  extra_sources <- unlist(extra_sources)
 
   # this is for the case where there are multiple sources in the same column
   epi_data %<>% filter_extraneous(filter_source, filter_agg_level)
diff --git a/R/forecasters/forecaster_scaled_pop.R b/R/forecasters/forecaster_scaled_pop.R
index 2b07a02a..7d64a5e6 100644
--- a/R/forecasters/forecaster_scaled_pop.R
+++ b/R/forecasters/forecaster_scaled_pop.R
@@ -47,7 +47,7 @@
 #' @export
 scaled_pop <- function(epi_data,
                        outcome,
-                       extra_sources = "",
+                       extra_sources = character(),
                        ahead = 1,
                        pop_scaling = TRUE,
                        drop_non_seasons = FALSE,
@@ -64,8 +64,7 @@ scaled_pop <- function(epi_data,
   nonlin_method <- arg_match(nonlin_method)
 
   epi_data <- validate_epi_data(epi_data)
-  extra_sources <- unwrap_argument(extra_sources)
-  trainer <- unwrap_argument(trainer)
+  extra_sources <- unlist(extra_sources)
 
   # perform any preprocessing not supported by epipredict
   #
diff --git a/R/forecasters/forecaster_scaled_pop_seasonal.R b/R/forecasters/forecaster_scaled_pop_seasonal.R
index 5091de92..92c85766 100644
--- a/R/forecasters/forecaster_scaled_pop_seasonal.R
+++ b/R/forecasters/forecaster_scaled_pop_seasonal.R
@@ -38,7 +38,7 @@
 scaled_pop_seasonal <- function(
   epi_data,
   outcome,
-  extra_sources = "",
+  extra_sources = character(),
   ahead = 1,
   pop_scaling = TRUE,
   drop_non_seasons = FALSE,
@@ -61,12 +61,8 @@ scaled_pop_seasonal <- function(
   nonlin_method <- arg_match(nonlin_method)
 
   epi_data <- validate_epi_data(epi_data)
-  extra_sources <- unwrap_argument(extra_sources)
-  trainer <- unwrap_argument(trainer)
+  extra_sources <- unlist(extra_sources)
 
-  if (typeof(seasonal_method) == "list") {
-    seasonal_method <- seasonal_method[[1]]
-  }
   if (all(seasonal_method == c("none", "flu", "covid", "indicator", "window", "climatological"))) {
     seasonal_method <- "none"
   }
diff --git a/R/forecasters/forecaster_smoothed_scaled.R b/R/forecasters/forecaster_smoothed_scaled.R
index 16765d67..13066577 100644
--- a/R/forecasters/forecaster_smoothed_scaled.R
+++ b/R/forecasters/forecaster_smoothed_scaled.R
@@ -51,7 +51,7 @@
 #' @export
 smoothed_scaled <- function(epi_data,
                             outcome,
-                            extra_sources = "",
+                            extra_sources = character(),
                             ahead = 1,
                             pop_scaling = TRUE,
                             trainer = parsnip::linear_reg(),
@@ -73,8 +73,7 @@ smoothed_scaled <- function(epi_data,
   nonlin_method <- arg_match(nonlin_method)
 
   epi_data <- validate_epi_data(epi_data)
-  extra_sources <- unwrap_argument(extra_sources)
-  trainer <- unwrap_argument(trainer)
+  extra_sources <- unlist(extra_sources)
 
   # perform any preprocessing not supported by epipredict
   #
diff --git a/R/targets/covid_forecaster_config.R b/R/targets/covid_forecaster_config.R
index e746ae71..29b2a8de 100644
--- a/R/targets/covid_forecaster_config.R
+++ b/R/targets/covid_forecaster_config.R
@@ -7,6 +7,10 @@
 #'
 #' Variables with 'g_' prefix are globals defined in the calling script.
 #'
+#' Note that expand_grid has some quirks:
+#' - if an entry is a vector c() or a list(), each top-level element is expanded out to a row.
+#' - this means that list(list()) reuses the same inner list for each row.
+#'
 #' @param dummy_mode Boolean indicating whether to use dummy forecasters
 #' @return A list of forecaster parameter combinations
 #' @export
@@ -32,7 +36,6 @@ get_covid_forecaster_params <- function() {
       expand_grid(
         forecaster = "scaled_pop",
         trainer = "quantreg",
-        # since it's a list, this gets expanded out to a single one in each row
         extra_sources = list2("nssp", "google_symptoms", "nwss", "nwss_region", "va_covid_per_100k"),
         lags = list2(
           list2(
@@ -105,18 +108,18 @@ get_covid_forecaster_params <- function() {
     scaled_pop_season = tidyr::expand_grid(
       forecaster = "scaled_pop_seasonal",
       trainer = "quantreg",
-      lags = list(
+      lags = list2(
         c(0, 7, 14, 21),
         c(0, 7)
       ),
       pop_scaling = FALSE,
       n_training = Inf,
-      seasonal_method = list(
-        c("covid"),
-        c("window"),
-        c("covid", "window"),
-        c("climatological"),
-        c("climatological", "window")
+      seasonal_method = list2(
+        list2("covid"),
+        list2("window"),
+        list2("covid", "window"),
+        list2("climatological"),
+        list2("climatological", "window")
       )
     ),
     climate_linear = bind_rows(
@@ -165,9 +168,6 @@ get_covid_forecaster_params <- function() {
         x$forecaster <- "dummy_forecaster"
       }
       x <- add_id(x)
-      if ("trainer" %in% names(x) && is.list(x$trainer)) {
-        x$trainer <- x$trainer[[1]]
-      }
       # Add the outcome to each forecaster.
       x$outcome <- "hhs"
       x
diff --git a/R/targets/flu_forecaster_config.R b/R/targets/flu_forecaster_config.R
index a62cf1a4..e4c987f5 100644
--- a/R/targets/flu_forecaster_config.R
+++ b/R/targets/flu_forecaster_config.R
@@ -209,31 +209,16 @@ get_flu_forecaster_params <- function() {
       tidyr::expand_grid(
         forecaster = "scaled_pop_seasonal",
         trainer = "quantreg",
-        lags = list2(
-          c(0, 7)
+        lags = list2(c(0, 7)),
+        seasonal_method = list2(
+          list2("window"),
+          list2("window", "flu"),
+          list2("window", "climatological")
         ),
-        seasonal_method = list("flu", "indicator", "climatological"),
-        pop_scaling = FALSE,
-        train_residual = c(TRUE, FALSE),
-        filter_source = c("", "nhsn"),
-        filter_agg_level = "state",
-        drop_non_seasons = c(TRUE, FALSE),
-        n_training = Inf,
-        keys_to_ignore = g_very_latent_locations
-      ),
-      # Window-based seasonal method shouldn't drop non-seasons
-      tidyr::expand_grid(
-        forecaster = "scaled_pop_seasonal",
-        trainer = "quantreg",
-        lags = list(
-          c(0, 7)
-        ),
-        seasonal_method = list("window", c("window", "flu"), c("window", "climatological")),
         pop_scaling = FALSE,
         train_residual = c(FALSE, TRUE),
         filter_source = c("", "nhsn"),
         filter_agg_level = "state",
-        drop_non_seasons = FALSE,
         n_training = Inf,
         keys_to_ignore = g_very_latent_locations
       )
@@ -250,7 +235,7 @@ get_flu_forecaster_params <- function() {
             c(0, 7) # exogenous feature
           )
         ),
-        seasonal_method = "window",
+        seasonal_method = list2("window"),
         pop_scaling = FALSE,
         filter_source = c("", "nhsn"),
         filter_agg_level = "state",
@@ -262,10 +247,8 @@ get_flu_forecaster_params <- function() {
     season_window_sizes = tidyr::expand_grid(
       forecaster = "scaled_pop_seasonal",
       trainer = "quantreg",
-      lags = list(
-        c(0, 7)
-      ),
-      seasonal_method = "window",
+      lags = list2(c(0, 7)),
+      seasonal_method = list2("window"),
       pop_scaling = FALSE,
       train_residual = FALSE,
       filter_source = c("", "nhsn"),
@@ -325,9 +308,6 @@ get_flu_forecaster_params <- function() {
         x$forecaster <- "dummy_forecaster"
       }
       x <- add_id(x)
-      if ("trainer" %in% names(x) && is.list(x$trainer)) {
-        x$trainer <- x$trainer[[1]]
-      }
       # Add the outcome to each forecaster.
       x$outcome <- "hhs"
       x
diff --git a/R/utils.R b/R/utils.R
index a73c3690..9c25dda4 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -30,7 +30,7 @@ forecaster_lookup <- function(pattern, forecaster_params_grid = NULL) {
 
   out <- forecaster_params_grid %>% filter(grepl(pattern, .data$id))
   if (nrow(out) > 0) {
-    out %>% glimpse()
+    out %>% unlist()
     return(out)
   }
 }
diff --git a/scripts/covid_hosp_explore.R b/scripts/covid_hosp_explore.R
index cc87f7d8..baa4f20d 100644
--- a/scripts/covid_hosp_explore.R
+++ b/scripts/covid_hosp_explore.R
@@ -29,9 +29,9 @@ g_reports_dir <- "reports"
 g_fetch_args <- epidatr::fetch_args_list(return_empty = FALSE, timeout_seconds = 400)
 # Geos with insufficient data for forecasting.
 g_insufficient_data_geos <- c("as", "pr", "vi", "gu", "mp")
-# Human-readable object to be used for inspecting the forecasters in the pipeline.
+# Parameters object used for grouping forecasters by family.
 g_forecaster_parameter_combinations <- get_covid_forecaster_params()
-# Targets-readable object to be used for running the pipeline.
+# Targets-readable object used for running the pipeline.
 g_forecaster_params_grid <- g_forecaster_parameter_combinations %>%
   imap(\(x, i) make_forecaster_grid(x, i)) %>%
   bind_rows()
diff --git a/scripts/flu_hosp_explore.R b/scripts/flu_hosp_explore.R
index 0b97b897..0daa2af0 100644
--- a/scripts/flu_hosp_explore.R
+++ b/scripts/flu_hosp_explore.R
@@ -34,9 +34,9 @@ g_very_latent_locations <- list(list(
   c("source"),
   c("flusurv", "ILI+")
 ))
-# Human-readable object to be used for inspecting the forecasters in the pipeline.
+# Parameters object used for grouping forecasters by family.
 g_forecaster_parameter_combinations <- get_flu_forecaster_params()
-# Targets-readable object to be used for running the pipeline.
+# Targets-readable object used for running the pipeline.
 g_forecaster_params_grid <- g_forecaster_parameter_combinations %>%
   imap(\(x, i) make_forecaster_grid(x, i)) %>%
   bind_rows()
diff --git a/scripts/one_offs/forecaster_profiling.R b/scripts/one_offs/forecaster_profiling.R
index 5f9588dc..b0d96961 100644
--- a/scripts/one_offs/forecaster_profiling.R
+++ b/scripts/one_offs/forecaster_profiling.R
@@ -11,7 +11,7 @@ p <- profvis::profvis({
     epi_archive = d,
     outcome = "hhs",
     ahead = 2,
-    extra_sources = "",
+    extra_sources = character(),
     forecaster = scaled_pop,
     n_training_pad = 30L,
     forecaster_args = list(

From 2c0768bfc69a7e21fed57e9125b212342c10bf08 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Thu, 8 May 2025 18:21:50 -0700
Subject: [PATCH 58/62] lint: minor idiom tweak

---
 R/forecasters/forecaster_scaled_pop_seasonal.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/forecasters/forecaster_scaled_pop_seasonal.R b/R/forecasters/forecaster_scaled_pop_seasonal.R
index 92c85766..6b820fac 100644
--- a/R/forecasters/forecaster_scaled_pop_seasonal.R
+++ b/R/forecasters/forecaster_scaled_pop_seasonal.R
@@ -63,7 +63,7 @@ scaled_pop_seasonal <- function(
   epi_data <- validate_epi_data(epi_data)
   extra_sources <- unlist(extra_sources)
 
-  if (all(seasonal_method == c("none", "flu", "covid", "indicator", "window", "climatological"))) {
+  if (identical(seasonal_method, c("none", "flu", "covid", "indicator", "window", "climatological"))) {
     seasonal_method <- "none"
   }
   # perform any preprocessing not supported by epipredict

From bf55134d2de25560092082230883aba71239c6e2 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Fri, 9 May 2025 20:08:46 -0700
Subject: [PATCH 59/62] feat: simplify daily_to_weekly_archive * dont use
 epi_slide, group instead * it's faster and simpler

---
 Makefile                        | 12 +++---
 R/aux_data_utils.R              | 71 +++++++--------------------------
 R/forecasters/data_validation.R |  3 --
 3 files changed, 21 insertions(+), 65 deletions(-)

diff --git a/Makefile b/Makefile
index 6796122c..58e455f0 100644
--- a/Makefile
+++ b/Makefile
@@ -11,6 +11,12 @@ test:
 run:
 	Rscript scripts/run.R
 
+run-nohup:
+	nohup Rscript scripts/run.R &
+
+run-nohup-restarting:
+	scripts/hardRestarting.sh &
+
 prod-covid:
 	export TAR_RUN_PROJECT=covid_hosp_prod; Rscript scripts/run.R
 
@@ -65,12 +71,6 @@ get-nwss:
 	python nwss_covid_export.py; \
 	python nwss_influenza_export.py
 
-run-nohup:
-	nohup Rscript scripts/run.R &
-
-run-nohup-restarting:
-	scripts/hardRestarting.sh &
-
 sync:
 	Rscript -e "source('R/sync_aws.R'); sync_aws()"
 
diff --git a/R/aux_data_utils.R b/R/aux_data_utils.R
index 91137677..3038339e 100644
--- a/R/aux_data_utils.R
+++ b/R/aux_data_utils.R
@@ -213,13 +213,15 @@ daily_to_weekly <- function(epi_df, agg_method = c("sum", "mean"), keys = "geo_v
 #' @param epi_arch the archive to aggregate.
 #' @param agg_columns the columns to aggregate.
 #' @param agg_method the method to use to aggregate the data, one of "sum" or "mean".
-#' @param day_of_week the day of the week to use as the reference day.
-#' @param day_of_week_end the day of the week to use as the end of the week.
+#' @param week_reference the day of the week to use as the reference day (Wednesday is default).
+#'   Note that this is 1-indexed, so 1 = Sunday, 2 = Monday, ..., 7 = Saturday.
+#' @param week_start the day of the week to use as the start of the week (Sunday is default).
+#'   Note that this is 1-indexed, so 1 = Sunday, 2 = Monday, ..., 7 = Saturday.
 daily_to_weekly_archive <- function(epi_arch,
                                     agg_columns,
                                     agg_method = c("sum", "mean"),
-                                    day_of_week = 4L,
-                                    day_of_week_end = 7L) {
+                                    week_reference = 4L,
+                                    week_start = 7L) {
   # How to aggregate the windowed data.
   agg_method <- arg_match(agg_method)
   # The columns we will later group by when aggregating.
@@ -230,67 +232,24 @@ daily_to_weekly_archive <- function(epi_arch,
     sort()
   # Choose a fast function to use to slide and aggregate.
   if (agg_method == "sum") {
-    slide_fun <- epi_slide_sum
+    # If the week is complete, this is equivalent to the sum. If the week is not
+    # complete, this is equivalent to 7/(number of days in the week) * the sum,
+    # which should be a decent approximation.
+    agg_fun <- \(x) 7 * mean(x, na.rm = TRUE)
   } else if (agg_method == "mean") {
-    slide_fun <- epi_slide_mean
+    agg_fun <- \(x) mean(x, na.rm = TRUE)
   }
   # Slide over the versions and aggregate.
   epix_slide(
     epi_arch,
     .versions = ref_time_values,
     function(x, group_keys, ref_time) {
-      # The last day of the week we will slide over.
-      ref_time_last_week_end <- floor_date(ref_time, "week", day_of_week_end - 1)
-
-      # To find the days we will slide over, we need to find the first and last
-      # complete weeks of data. Get the max and min times, and then find the
-      # first and last complete weeks of data.
-      min_time <- min(x$time_value)
-      max_time <- max(x$time_value)
-
-      # Let's determine if the min and max times are in the same week.
-      ceil_min_time <- ceiling_date(min_time, "week", week_start = day_of_week_end - 1)
-      ceil_max_time <- ceiling_date(max_time, "week", week_start = day_of_week_end - 1)
-
-      # If they're not in the same week, this means we have at least one
-      # complete week of data to slide over.
-      if (ceil_min_time < ceil_max_time) {
-        valid_slide_days <- seq.Date(
-          from = ceiling_date(min_time, "week", week_start = day_of_week_end - 1),
-          to = floor_date(max_time, "week", week_start = day_of_week_end - 1),
-          by = 7L
-        )
-      } else {
-        # This is the degenerate case, where we have about 1 week or less of
-        # data. In this case, we opt to return nothing for two reasons:
-        # 1. in most cases here, the data is incomplete for a single week,
-        # 2. if the data is complete, a single week of data is not enough to
-        #    reasonably perform any kind of aggregation.
-        return(tibble())
-      }
-
-      # If the last day of the week is not the end of the week, add it to the
-      # list of valid slide days (this will produce an incomplete slide, but
-      # that's fine for us, since it should only be 1 day, historically.)
-      if (wday(max_time) != day_of_week_end) {
-        valid_slide_days <- c(valid_slide_days, max_time)
-      }
-
       # Slide over the days and aggregate.
       x %>%
-        group_by(across(all_of(keys))) %>%
-        slide_fun(
-          agg_columns,
-          .window_size = 7L,
-          na.rm = TRUE,
-          .ref_time_values = valid_slide_days
-        ) %>%
-        select(-all_of(agg_columns)) %>%
-        rename_with(~ gsub("slide_value_", "", .x)) %>%
-        rename_with(~ gsub("_7dsum", "", .x)) %>%
-        # Round all dates to reference day of the week. These will get
-        # de-duplicated by compactify in as_epi_archive below.
-        mutate(time_value = round_date(time_value, "week", day_of_week - 1)) %>%
+        mutate(week_start = ceiling_date(time_value, "week", week_start = week_start)-1) %>%
+        summarize(across(all_of(agg_columns), agg_fun), .by = all_of(c(keys, "week_start"))) %>%
+        mutate(time_value = round_date(week_start, "week", week_reference - 1)) %>%
+        select(-week_start) %>%
         as_tibble()
     }
   ) %>%
diff --git a/R/forecasters/data_validation.R b/R/forecasters/data_validation.R
index 2d9f610b..0ecc8d20 100644
--- a/R/forecasters/data_validation.R
+++ b/R/forecasters/data_validation.R
@@ -68,9 +68,6 @@ confirm_sufficient_data <- function(epi_data, ahead, args_input, outcome, extra_
   # TODO: Buffer should probably be 2 * n(lags) * n(predictors). But honestly,
   # this needs to be fixed in epipredict itself, see
   # https://github.com/cmu-delphi/epipredict/issues/106.
-  if (identical(extra_sources, "")) {
-    extra_sources <- character(0L)
-  }
   has_no_last_nas <- epi_data %>%
     drop_na(c(!!outcome, !!!extra_sources)) %>%
     group_by(geo_value) %>%

From 8ba796ec31ebc3ef84fb08678e42b4ab9d86b20e Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Sat, 10 May 2025 02:09:44 -0700
Subject: [PATCH 60/62] enh: add summary reports to makefile

---
 Makefile                  | 3 +++
 scripts/summary_reports.R | 5 +++++
 2 files changed, 8 insertions(+)
 create mode 100644 scripts/summary_reports.R

diff --git a/Makefile b/Makefile
index 58e455f0..d6cfa8c7 100644
--- a/Makefile
+++ b/Makefile
@@ -98,3 +98,6 @@ get-flu-prod-errors:
 
 get-covid-prod-errors:
 	Rscript -e "suppressPackageStartupMessages(source(here::here('R', 'load_all.R'))); get_targets_errors(project = 'covid_hosp_prod')"
+
+summary_reports:
+	Rscript scripts/summary_reports.R
\ No newline at end of file
diff --git a/scripts/summary_reports.R b/scripts/summary_reports.R
new file mode 100644
index 00000000..cea8f9ab
--- /dev/null
+++ b/scripts/summary_reports.R
@@ -0,0 +1,5 @@
+rmarkdown::render("scripts/reports/revision_summary_report_2025.Rmd", output_file = here::here("reports", "revision_summary_2025.html"))
+rmarkdown::render("scripts/reports/decreasing_forecasters.Rmd", output_file = here::here("reports", "decreasing_forecasters.html"))
+rmarkdown::render("scripts/reports/season_summary_2025.Rmd", output_file = here::here("reports", "season_summary_2025.html"))
+rmarkdown::render("scripts/reports/first_day_wrong.Rmd", output_file = here::here("reports", "first_day_wrong.html"))
+system("pandoc scripts/reports/exploration_summary_2025.md -s -o reports/exploration_summary_2025.html --css reports/style.css --metadata pagetitle='Exploration Summary 2024-2025'")

From e9c5f3b4cf52ff066e06c0e836b52cd96ef9f77e Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Sat, 10 May 2025 02:24:55 -0700
Subject: [PATCH 61/62] fix: css

---
 R/utils.R                 | 2 +-
 scripts/summary_reports.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index 9c25dda4..2b10d550 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -402,7 +402,7 @@ update_site <- function(sync_to_s3 = TRUE) {
 
   # Convert the markdown file to HTML
   system(
-    "pandoc reports/report.md -s -o reports/index.html --css=reports/style.css --mathjax='https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js' --metadata pagetitle='Delphi Reports'"
+    "pandoc reports/report.md -s -o reports/index.html --css=style.css --mathjax='https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js' --metadata pagetitle='Delphi Reports'"
   )
 }
 
diff --git a/scripts/summary_reports.R b/scripts/summary_reports.R
index cea8f9ab..44299350 100644
--- a/scripts/summary_reports.R
+++ b/scripts/summary_reports.R
@@ -2,4 +2,4 @@ rmarkdown::render("scripts/reports/revision_summary_report_2025.Rmd", output_fil
 rmarkdown::render("scripts/reports/decreasing_forecasters.Rmd", output_file = here::here("reports", "decreasing_forecasters.html"))
 rmarkdown::render("scripts/reports/season_summary_2025.Rmd", output_file = here::here("reports", "season_summary_2025.html"))
 rmarkdown::render("scripts/reports/first_day_wrong.Rmd", output_file = here::here("reports", "first_day_wrong.html"))
-system("pandoc scripts/reports/exploration_summary_2025.md -s -o reports/exploration_summary_2025.html --css reports/style.css --metadata pagetitle='Exploration Summary 2024-2025'")
+system("pandoc scripts/reports/exploration_summary_2025.md -s -o reports/exploration_summary_2025.html --css style.css --metadata pagetitle='Exploration Summary 2024-2025'")

From a8b70b002abaab3ea294b2370eaeca750990afec Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Sat, 10 May 2025 02:27:50 -0700
Subject: [PATCH 62/62] f

---
 reports/template.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reports/template.md b/reports/template.md
index 6302801b..71b903cd 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -22,7 +22,7 @@
 
 ## 2023-2024 Season Backtesting
 
-- [Forecaster Exploration Summary](exploration_summary_2024.html)
+- [Forecaster Exploration Summary](exploration_summary_2025.html)
 - Flu
   - [Flu Overall](flu-overall-notebook.html)
   - [Flu AR](flu-notebook-scaled_pop_main.html)