diff --git a/R/forecasters/data_validation.R b/R/forecasters/data_validation.R
index 9cbd7bf1..40fce850 100644
--- a/R/forecasters/data_validation.R
+++ b/R/forecasters/data_validation.R
@@ -97,6 +97,16 @@ filter_extraneous <- function(epi_data, filter_source, filter_agg_level) {
return(epi_data)
}
+#' The -1 ahead causes problems for `quantile_regression` if that data is
+#' actually present, so we filter it out
+filter_minus_one_ahead <- function(epi_data, ahead) {
+ if (ahead < 0) {
+ dont_include <- attr(epi_data, "metadata")$as_of + ahead
+ epi_data %<>% filter(time_value < dont_include)
+ }
+ epi_data
+}
+
#' Unwrap an argument if it's a list of length 1
#'
#' Many of our arguments to the forecasters come as lists not because we expect
diff --git a/R/forecasters/forecaster_scaled_pop_seasonal.R b/R/forecasters/forecaster_scaled_pop_seasonal.R
index 7d96bd76..5091de92 100644
--- a/R/forecasters/forecaster_scaled_pop_seasonal.R
+++ b/R/forecasters/forecaster_scaled_pop_seasonal.R
@@ -76,6 +76,8 @@ scaled_pop_seasonal <- function(
epi_data %<>% filter_extraneous(filter_source, filter_agg_level)
# this is a temp fix until a real fix gets put into epipredict
epi_data <- clear_lastminute_nas(epi_data, cols = c(outcome, extra_sources))
+  # predicting the -1 ahead when it is present sometimes leads to freezing
+ epi_data %<>% filter_minus_one_ahead(ahead)
# this next part is basically unavoidable boilerplate you'll want to copy
args_input <- list(...)
# edge case where there is no data or less data than the lags; eventually epipredict will handle this
diff --git a/R/forecasters/formatters.R b/R/forecasters/formatters.R
index ce42c0bb..f8aed36d 100644
--- a/R/forecasters/formatters.R
+++ b/R/forecasters/formatters.R
@@ -72,16 +72,15 @@ format_flusight <- function(pred, disease = c("flu", "covid")) {
}
format_scoring_utils <- function(forecasts_and_ensembles, disease = c("flu", "covid")) {
- forecasts_and_ensembles %>%
- filter(!grepl("region.*", geo_value)) %>%
- mutate(
- reference_date = get_forecast_reference_date(forecast_date),
- target = glue::glue("wk inc {disease} hosp"),
- horizon = as.integer(floor((target_end_date - reference_date) / 7)),
- output_type = "quantile",
- output_type_id = quantile,
- value = value
- ) %>%
+ # dplyr here was unreasonably slow on 1m+ rows, so replacing with direct access
+ fc_ens <- forecasts_and_ensembles
+  fc_ens <- fc_ens[!grepl("region.*", fc_ens$geo_value), ]
+ fc_ens[, "reference_date"] <- get_forecast_reference_date(fc_ens$forecast_date)
+ fc_ens[, "target"] <- glue::glue("wk inc {disease} hosp")
+ fc_ens[, "horizon"] <- as.integer(floor((fc_ens$target_end_date - fc_ens$reference_date) / 7))
+ fc_ens[, "output_type"] <- "quantile"
+ fc_ens[, "output_type_id"] <- fc_ens$quantile
+ fc_ens %>%
left_join(
get_population_data() %>%
select(state_id, state_code),
@@ -89,7 +88,11 @@ format_scoring_utils <- function(forecasts_and_ensembles, disease = c("flu", "co
) %>%
rename(location = state_code, model_id = forecaster) %>%
select(reference_date, target, horizon, target_end_date, location, output_type, output_type_id, value, model_id) %>%
- drop_na()
+ drop_na() %>%
+ arrange(location, target_end_date, reference_date, output_type_id) %>%
+ group_by(model_id, location, target_end_date, reference_date) %>%
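+    # sorting values within each group makes them non-decreasing in output_type_id (fixes quantile crossing)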
+ mutate(value = sort(value)) %>%
+ ungroup()
}
#' The quantile levels used by the covidhub repository
diff --git a/R/targets/score_targets.R b/R/targets/score_targets.R
index afa47c95..a0648ed6 100644
--- a/R/targets/score_targets.R
+++ b/R/targets/score_targets.R
@@ -13,7 +13,7 @@ get_external_forecasts <- function(external_object_name) {
select(forecaster, geo_value, forecast_date, target_end_date, quantile, value)
}
-score_forecasts <- function(nhsn_latest_data, joined_forecasts_and_ensembles) {
+score_forecasts <- function(nhsn_latest_data, joined_forecasts_and_ensembles, disease) {
truth_data <-
nhsn_latest_data %>%
select(geo_value, target_end_date = time_value, oracle_value = value) %>%
@@ -33,9 +33,8 @@ score_forecasts <- function(nhsn_latest_data, joined_forecasts_and_ensembles) {
pull(max_forecast) %>%
min()
forecasts_formatted <-
- joined_forecasts_and_ensembles %>%
- filter(forecast_date <= max_forecast_date) %>%
- format_scoring_utils(disease = "covid")
+ joined_forecasts_and_ensembles[joined_forecasts_and_ensembles$forecast_date <= max_forecast_date,] %>%
+ format_scoring_utils(disease = disease)
scores <- forecasts_formatted %>%
filter(location %nin% c("US", "60", "66", "78")) %>%
hubEvals::score_model_out(
diff --git a/R/utils.R b/R/utils.R
index 45fe863c..a73c3690 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -392,39 +392,9 @@ update_site <- function(sync_to_s3 = TRUE) {
)
# Insert into Production Reports section, skipping a line
- prod_reports_index <- which(grepl("## Production Reports", report_md_content)) + 1
+ prod_reports_index <- which(grepl("## Weekly Fanplots 2024-2025 Season", report_md_content)) + 1
report_md_content <- append(report_md_content, report_link, after = prod_reports_index)
}
- # add scoring notebooks if they exist
- score_files <- dir_ls(reports_dir, regexp = ".*_backtesting_2024_2025_on_.*.html")
- if (length(score_files) > 0) {
- # a tibble of all score files, along with their generation date and disease
- score_table <- tibble(
- filename = score_files,
- dates = str_match_all(filename, "[0-9]{4}-..-..")
- ) %>%
- unnest_wider(dates, names_sep = "_") %>%
- rename(generation_date = dates_1) %>%
- mutate(
- generation_date = ymd(generation_date),
- disease = str_match(filename, "flu|covid")
- )
- used_files <- score_table %>%
- group_by(disease) %>%
- slice_max(generation_date)
- # iterating over the diseases
- for (row_num in seq_along(used_files$filename)) {
- file_name <- path_file(used_files$filename[[row_num]])
- scoring_index <- which(grepl("### Scoring this season", report_md_content)) + 1
- score_link <- sprintf(
- "- [%s Scoring, Rendered %s](%s)",
- str_to_title(used_files$disease[[row_num]]),
- used_files$generation_date[[row_num]],
- file_name
- )
- report_md_content <- append(report_md_content, score_link, after = scoring_index)
- }
- }
# Write the updated content to report.md
report_md_path <- path(reports_dir, "report.md")
@@ -725,3 +695,50 @@ get_socrata_updated_at <- function(dataset_url, missing_value = MAX_TIMESTAMP) {
}
)
}
+
+
+
+#' get the unique shared (geo_value, forecast_date, target_end_date) tuples present for each forecaster in `forecasts`
+get_unique <- function(forecasts) {
+ forecasters <- forecasts %>%
+ pull(forecaster) %>%
+ unique()
+ distinct <- map(
+ forecasters,
+ \(x) forecasts %>%
+ filter(forecaster == x) %>%
+ select(geo_value, forecast_date, target_end_date) %>%
+ distinct()
+ )
+ distinct_dates <- reduce(distinct, \(x, y) x %>% inner_join(y, by = c("geo_value", "forecast_date", "target_end_date")))
+ mutate(
+ distinct_dates,
+ forecast_date = round_date(forecast_date, unit = "week", week_start = 6)
+ )
+}
+
+#' Filter the external and local forecasts down to the shared dates/geos.
+#' Some forecasters have a limited set of geos that we nevertheless want to
+#' include; these are the `truncated_forecasters`. The external forecasts may
+#' also contain previous seasons' forecasts, which we definitely want to
+#' exclude via `season_start`.
+filter_shared_geo_dates <- function(local_forecasts, external_forecasts, season_start = "2024-11-01", truncated_forecasters = "windowed_seasonal_extra_sources") {
+ viable_dates <- inner_join(
+ local_forecasts %>%
+      filter(forecaster %nin% truncated_forecasters) %>%
+ get_unique(),
+ external_forecasts %>%
+ filter(forecast_date > season_start) %>%
+ get_unique(),
+ by = c("geo_value", "forecast_date", "target_end_date")
+ )
+ dplyr::bind_rows(
+ local_forecasts %>%
+ mutate(
+ forecast_date = round_date(forecast_date, unit = "week", week_start = 6)
+ ) %>%
+ inner_join(viable_dates, by = c("geo_value", "forecast_date", "target_end_date")),
+ external_forecasts %>%
+ inner_join(viable_dates, by = c("geo_value", "forecast_date", "target_end_date"))
+ )
+}
diff --git a/covid_geo_exclusions.csv b/covid_geo_exclusions.csv
index 575422c2..3a7fb91d 100644
--- a/covid_geo_exclusions.csv
+++ b/covid_geo_exclusions.csv
@@ -141,14 +141,14 @@ forecast_date,forecaster,geo_value,weight
##################
# feb 12
##################
-2025-02-05, all, mp, 0
-2025-02-05, windowed_seasonal, all, 3
-2025-02-05, windowed_seasonal_extra_sources, all, 0.0
-2025-02-05, climate_linear, all, 3
-2025-02-05, linear, all, 0.5
-2025-02-05, linearlog, all, 0
-2025-02-05, climate_base, all, 0
-2025-02-05, climate_geo_agged, all, 0.0
+2025-02-12, all, mp, 0
+2025-02-12, windowed_seasonal, all, 3
+2025-02-12, windowed_seasonal_extra_sources, all, 0.0
+2025-02-12, climate_linear, all, 3
+2025-02-12, linear, all, 0.5
+2025-02-12, linearlog, all, 0
+2025-02-12, climate_base, all, 0
+2025-02-12, climate_geo_agged, all, 0.0
##################
# feb 5
##################
diff --git a/reports/template.md b/reports/template.md
index 6c99ef24..9034a60f 100644
--- a/reports/template.md
+++ b/reports/template.md
@@ -1,161 +1,202 @@
-# Forecast Reports
+
+
+# Delphi Forecast Reports
[GitHub Repo](https://github.com/cmu-delphi/explorationt-tooling/)
-## Production Reports
+## Overview
+- The weekly fanplots were used by the team for visual inspections of the forecasts.
+- The season reports provide a general analysis of the season's data and forecast performance.
+- The backtesting reports provide a detailed analysis of a wide variety of forecasters' performance on the previous season's data.
+- A description of the forecaster families explored is provided at the bottom of the page.
-### Scoring this season
+## Weekly Fanplots 2024-2025 Season
-## Exploration Reports
+## 2024-2025 Season Reports
+- [Season Summary](season_summary_2025.html) (the notebooks below are linked from here)
+ - [Covid's Problematic Initial Forecast](first_day_wrong.html)
+ - [NHSN Revision Behavior](revision_summary_2025.html)
- [An Analysis of Decreasing Behavior in Forecasters](decreasing_forecasters.html)
- [NHSN 2024-2025 Data Analysis](new_data.html)
-### Flu
+## Backtesting on 2023-2024 Season
+
+- [Exploration Summary](exploration_summary_2024.html)
+- Flu
+ - All forecasters population scale their data, use geo pooling, and train using quantreg.
+ - These definitions are in the `flu_forecaster_config.R` file.
+ - [Flu Overall](flu-overall-notebook.html)
+ - [Flu AR](flu-notebook-scaled_pop_main.html)
+ - [Flu AR with augmented data](flu-notebook-scaled_pop_data_augmented.html)
+ - [Flu AR with exogenous features](flu-notebook-scaled_pop_exogenous.html)
+ - [Flu AR with different seasonal schemes](flu-notebook-scaled_pop_season.html)
+ - [Flu AR with augmented data and with different seasonal window sizes](flu-notebook-season_window_sizes.html)
+ - [Flu AR with augmented data, exogenous features, and seasonal windowing](flu-notebook-scaled_pop_season_exogenous.html)
+ - Simplistic/low data methods:
+    - [Flu no recent](flu-notebook-no_recent_quant.html)
+ - [Flu flatline](flu-notebook-flatline.html)
+ - [Flu climate](flu-notebook-climate_linear.html)
+- Covid
+ - All forecasters population scale their data, use geo pooling, and train using quantreg.
+ - These definitions are in the `covid_forecaster_config.R` file.
+ - [Covid AR](covid-notebook-scaled_pop_main.html)
+ - [Covid AR with seasonal features](covid-notebook-scaled_pop_season.html)
+ - [Covid AR with exogenous features](covid-notebook-scaled_pop_exogenous.html)
+ - [Covid Flatline](covid-notebook-flatline_forecaster.html)
+ - Simplistic/low data methods:
+ - [Covid no recent](covid-notebook-no_recent_quant.html)
+ - [Covid flatline](covid-notebook-flatline.html)
+ - [Covid climate](covid-notebook-climate_linear.html)
+
+## Description of Forecaster Families
+
+The main forecaster families were:
+
+- Autoregressive models (AR)
+ - with seasonal features
+ - with exogenous features
+ - with augmented data
+- Climatological
+- Linear trend
+- No recent outcome
+- Flatline
+
+All the AR models had the option of population scaling, seasonal features, exogenous features, and augmented data.
+We tried all possible combinations of these features.
+All models had the option of using the `linreg`, `quantreg`, or `grf` engine.
+We found that `quantreg` gave better results than `linreg` and we had computational issues with `grf`, so we used `quantreg` the rest of the time.
+
+### Autoregressive models (AR)
-All forecasters population scale their data, use geo pooling, and train using quantreg.
-These definitions are in the `flu_forecaster_config.R` file.
-
-- [Flu Overall](flu-overall-notebook.html)
-- [Flu AR](flu-notebook-scaled_pop_main.html)
-- [Flu AR with augmented data](flu-notebook-scaled_pop_data_augmented.html)
-- [Flu AR with exogenous features](flu-notebook-scaled_pop_exogenous.html)
-- [Flu AR with different seasonal schemes](flu-notebook-scaled_pop_season.html)
-- [Flu AR with augmented data and with different seasonal window sizes](flu-notebook-season_window_sizes.html)
-- [Flu AR with augmented data, exogenous features, and seasonal windowing](flu-notebook-scaled_pop_season_exogenous.html)
-
-Simplistic/low data methods:
-
-- [Flu no recent](flu-notebook-no_recent_quant.html)
-- [Flu flatline](flu-notebook-flatline.html)
-- [Flu climate](flu-notebook-climate_linear.html)
-
-### Covid
-
-All forecasters population scale their data, use geo pooling, and train using quantreg.
-These definitions are in the `covid_forecaster_config.R` file.
-
-- [Covid AR](covid-notebook-scaled_pop_main.html)
-- [Covid AR with seasonal features](covid-notebook-scaled_pop_season.html)
-- [Covid AR with exogenous features](covid-notebook-scaled_pop_exogenous.html)
-- [Covid Flatline](covid-notebook-flatline_forecaster.html)
-
-Simplistic/low data methods:
-
-- [Covid no recent](covid-notebook-no_recent_quant.html)
-- [Covid flatline](covid-notebook-flatline.html)
-- [Covid climate](covid-notebook-climate_linear.html)
-
-## Descriptions of Forecaster Families
-
-### Training Data Information
-
-(Taken from [David's Org File](https://github.com/cmu-delphi/exploration-tooling/blob/5a6da8d0d0202da6d79a5ee8e702d4654364ce46/forecasters_description.org#flusion).)
-
-Some use just NHSN, while others use historical data from ILI+ and Flusurv+ as
-additional rows in training. ILI+ and Flusurv+ have been adjusted so that the
-total for the season matches NHSN’s total. Flusurv is taken from epidata, but
-ILI+ was constructed by Evan and given to Richard. The testing date range is
-roughly the 2023 season, so October 2023 through late April 2024.
-
-### Flu exogenous features
+Internal name: `scaled_pop`.
-- NSSP
- Note that this data set is possibly cheating, as we don't have revisions before April of this year, so it is using the latest data.
- If we narrow down to `time_value`s after that, the revision behavior is
+A simple autoregressive model, which predicts using
- ```
- Min lag (time to first version):
- min median mean max
- 7 days 7 days 7.7 days 14 days
- Fraction of epi_key+time_values with
- No revisions:
- • 362 out of 954 (37.95%)
- Quick revisions (last revision within 3 days of the `time_value`):
- • 0 out of 954 (0%)
- Few revisions (At most 3 revisions for that `time_value`):
- • 946 out of 954 (99.16%)
+$$x_{t+k} = ar(x)$$
- Fraction of revised epi_key+time_values which have:
- Less than 0.1 spread in relative value:
- • 329 out of 592 (55.57%)
- Spread of more than 0.1015 in actual value (when revised):
- • 18 out of 592 (3.04%)
- days until within 20% of the latest value:
- min median mean max
- 7 days 7 days 9 days 70 days
- ```
+where $x$ is the target variable and $ar(x)$ is a linear combination of the target variable's past values, which can be scaled according to each state's population or whitened according to another scheme (or both). In practice, we found that using lags (0, 7) was quite effective, with (0, 7, 14) and (0, 7, 14, 21) providing no discernible advantage, so in practice our model was
- So most days have some revisioning, but with fairly small total changes, with the vast majority of days being within 20% of their eventual value within a week (with some much longer exceptions, apparently).
- So the impact of the cheating is likely small but of course hard to know.
+$$x_{t+k} = a_0 x_t + a_1 x_{t-7}$$
-- Google-Symptoms
- This dataset doesn't have revisions, but has a history of suddenly disappearing.
- The latest value was used to simulate actually having the data; at worst, it breaks down to being the underlying forecaster.
-- NWSS and NWSS_regional
- The originating dataset has minimal revisions, but as this is a dataset with quite a lot of processing from the underlying that involves some amount of time travel, it is unclear how much revision behavior it effectively has.
+where $k \in \{0, 7, 14, 21, 28\}$ is the forecast horizon.
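+
+As a rough sketch of this design (hypothetical names; the real implementation uses epipredict and `quantreg`, elided here in favor of `lm`):
+
+```r
+# Minimal sketch, assuming a weekly data frame `dat` with columns
+# geo_value, time_value, value.
+library(dplyr)
+ahead <- 14 # days
+train <- dat %>%
+  group_by(geo_value) %>%
+  arrange(time_value) %>%
+  mutate(
+    lag0 = value,
+    lag7 = lag(value, 1), # one weekly step back = a 7-day lag
+    target = lead(value, ahead / 7)
+  ) %>%
+  ungroup()
+fit <- lm(target ~ lag0 + lag7, data = train)
+```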
-### Data Whitening
+### Autoregressive models with seasonal features
-The data augmented models using ILI+ and FluSurv+ take a few different approaches to data whitening, depending on the `scale_method, center_method, nonlin_method` parameters.
+Internal name: `scaled_pop_seasonal`.
-TODO: Add descriptions.
+We tried a few different attempts at incorporating seasonal features:
-This is more closely in line with the [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler) from scikit-learn (using a much wider quantile than the default settings there).
+- The approach that performed the best was using a *seasonal training window* that grabbed a window of data (about 4 weeks before and after) around the forecast epiweek from the current and previous seasons (see the sketch after this list).
+- Two *indicator variables* that roughly correspond to before, during, and after the typical peak (roughly, `before = season_week < 16`, `during = 16 <= season_week <= 20`, and `after = season_week > 20`).
+- Taking the first two *principal components* of the full whitened augmented data reshaped as `(epiweek, state_source_season_value)`.
+(We found that this was not particularly effective, so we did not use it.
+Despite spending a week debugging this, we could not rule out the possibility that it was a bug.
+However, we also had mixed results from tests of this feature in very simple synthetic data cases.)
+- We also tried using the *climatological median* of the target variable as a feature (see below for definition of "climatological").
+- Note that, unusually, the last two features are led rather than lagged, since we should be predicting using the feature's value at the target date rather than its present one.
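+
+A minimal sketch of the seasonal training window from the first bullet (function and parameter names are illustrative, not the actual implementation):
+
+```r
+library(lubridate)
+# Keep only rows whose epiweek is within `half_width` weeks of the forecast
+# epiweek, wrapping around the ~52-week year; applied across all seasons.
+seasonal_window <- function(dat, forecast_date, half_width = 4) {
+  target_week <- epiweek(forecast_date)
+  raw_dist <- abs(epiweek(dat$time_value) - target_week)
+  week_dist <- pmin(raw_dist, 52 - raw_dist)
+  dat[week_dist <= half_width, ]
+}
+```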
-## Overall comparison
+### Autoregressive models with exogenous features
-This takes the best mean WIS result from each of the forecaster families below, and puts them in the same notebook for inter-family comparison.
+Internal name: `scaled_pop_seasonal`.
-## Forecaster Families
+These models could opt into the same seasonal features as the `scaled_pop_seasonal` forecaster, but also included exogenous features.
-### AR with population scaling
+#### Flu exogenous features
-Internal name: `scaled_pop`.
+- NSSP - we don't have revisions before Spring 2024 for this data, so we used a revision analysis of the data collected after that date to estimate the lag (roughly 7 days) and used that lag to simulate delays (see the sketch after this list).
+- Google-Symptoms - this dataset doesn't have revisions, but has a history of suddenly disappearing, resulting in intermittent long update lags.
+We did not simulate a lag and just used the latest value, as a best-case scenario.
+The symptom set used was s01, s03, and s04 from [here](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/google-symptoms.html).
+- NWSS - the originating dataset has minimal revisions, but since it involves quite a lot of processing of the underlying data, including some amount of time travel, it is unclear how much revision behavior is effectively present.
+- NWSS_regional - same as NWSS, just aggregated to the HHS region level.
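+
+The NSSP delay simulation amounts to holding back recent rows; a hedged sketch (names are ours, not the pipeline's):
+
+```r
+# Treat a value for time t as available only once t + assumed_lag has passed,
+# approximating the ~7-day reporting lag estimated from the revision analysis.
+simulate_lag <- function(dat, forecast_date, assumed_lag = 7) {
+  dat[dat$time_value <= as.Date(forecast_date) - assumed_lag, ]
+}
+```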
-A simple model, which predicts using
+#### Covid exogenous features
-$$x_{t+k} = ar(x)$$
+- NSSP - same as flu.
+- Google-Symptoms - same as flu, though we used a slightly different symptom set (just s04 and s05 from [here](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/google-symptoms.html)).
-where $x$ is scaled according to each state’s population.
+### Autoregressive models with augmented data
-Three versions, two with different engines `quantreg` and `grf`, and the final one with augmented data.
+Internal name: `scaled_pop` (with `filter_source = ""`).
-### AR with population scaling and seasonal features
+This forecaster is still the standard autoregressive model, but with additional training data.
+Inspired by UMass-flusion, the additional training data consisted of historical data from ILI+ and Flusurv+, which was brought to a level comparable with NHSN and treated as additional observations of the target variable (hence the name "augmented data").
+Flusurv was taken from epidata, but ILI+ was constructed by Evan Ray and given to Richard (Berkeley Summer 2024 intern).
+Naturally, this forecaster was only used for flu, as the same data was not available for covid.
-Internal name: `scaled_pop_seasonal`.
+#### Scaling Parameters (Data Whitening)
-There are 2 seasonal features that we're trying here:
+The augmented data forecasters took a few different approaches to data whitening (akin to [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler) from scikit-learn).
-1. taking the first 3 PC components from the whitened fused data (so nhsn, ILI+, and Flusurv). (Note that it's 2 for covid).
-2. 2 indicators that roughly correspond to before, during and after the typical peak (first is true when `season_week < 16`, the second is true when `season_week > 20`, and the peak is captured by the overall constant).
- Note that unusually, these features are actually led rather than lagged, since we should be predicting using the target's coefficient, rather than the present one.
+- `scale_method`
+ - `quantile` - scales the data so that the difference between the 5th and 95th quantiles is 1
+ - `quantile_upper` - scales the data so that the 95th quantile is 1 (this was used by UMass-flusion)
+ - `std` - scales the data so that one standard deviation is 1
+ - `none` - no scaling
+ - We did not see a significant difference in changing the above parameter, so we used the default `quantile` the rest of the time.
+- `center_method`
+ - `median` - centers the data so that the median is 0
+ - `mean` - centers the data so that the mean is 0
+ - `none` - no centering
+ - We did not see a significant difference in changing the above parameter, so we used the default `median` the rest of the time.
+- `nonlin_method`
+ - `quart_root` - takes the 4th root of the data (and adds 0.01 to avoid negative values)
+ - `none` - no non-linear transformation
+  - Of these, `quart_root` gave us the best results, so we used it the rest of the time. There were occasional issues where the 0.01 offset caused small positive values to hit the floor when the transformation was inverted.
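+
+A hedged sketch of how these three options compose (illustrative only; the actual parameter handling lives in the forecaster configs):
+
+```r
+whiten <- function(x, scale_method = "quantile", center_method = "median",
+                   nonlin_method = "quart_root") {
+  # non-linear transform first, with the epsilon offset discussed above
+  if (nonlin_method == "quart_root") x <- (x + 0.01)^(1 / 4)
+  center <- switch(center_method,
+    median = median(x, na.rm = TRUE),
+    mean = mean(x, na.rm = TRUE),
+    none = 0
+  )
+  x <- x - center
+  scale <- switch(scale_method,
+    quantile = diff(quantile(x, c(0.05, 0.95), na.rm = TRUE, names = FALSE)),
+    quantile_upper = quantile(x, 0.95, na.rm = TRUE, names = FALSE),
+    std = sd(x, na.rm = TRUE),
+    none = 1
+  )
+  x / scale
+}
+```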
-### Flusion-like
+### Climatological
-Roughly designed in line with the flusion model.
+This was our term for a forecaster that directly forecasts a distribution built from similar weeks in previous seasons (in analogy with baseline weather forecasting).
+We found that in some cases it made a reasonable baseline, though when the current season's peak time was significantly different from the seasons in the training data, it was not particularly effective.
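+
+As a sketch of the idea (ours, not the production code): pool observations from nearby epiweeks across past seasons and read off empirical quantiles.
+
+```r
+climate_quantiles <- function(dat, forecast_epiweek, half_width = 3,
+                              probs = c(0.1, 0.25, 0.5, 0.75, 0.9)) {
+  wk <- lubridate::epiweek(dat$time_value)
+  raw_dist <- abs(wk - forecast_epiweek)
+  pool <- dat[pmin(raw_dist, 52 - raw_dist) <= half_width, ]
+  quantile(pool$value, probs = probs, na.rm = TRUE)
+}
+```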
### No Recent Outcome
-This is the fall-back forecaster, in case we have no data, but are forced to make a prediction.
+This was a fall-back forecaster built for the scenario where NHSN data was not going to be reported in time for the start of the forecasting challenge.
A flusion-adjacent model pared down to handle the case of not having the target as a predictor.
-$$\bar{x}_{t+k} = f(t_{season}) + p + d + \big\langle y_{t-k}\big\rangle_{k=0:1} + \big\langle y_{t-k}\big\rangle_{t=0:3}$$
-
-where $y$ here is any exogenous variables; initially this will be empty, as nssp is missing some states, so we will have to rewrite these models to handle missing geos (most likely by having a separate model for the case when an exogenous variable is missing).
+$$\bar{x}_{t+k} = \big\langle y_{t-j}\big\rangle_{j=0:1} + \big\langle y_{t-j}\big\rangle_{j=0:3}$$
-$f$ is either the identity or 2 sine terms, defined so that the first has half a period during the season, and is zero after it, while the second is one period over the season, with zero after
+where $y$ here is any set of exogenous variables.
### Flatline
-This is what the FluSight-baseline is based on, so they should be identical. However, at the moment, this has scaling issues.
-
-# Covid Forecasts 2024-2025
-
-For now, just AR forecasters with source-pooled data. Forecaster descriptions
-are the same as above.
-
-TODO: Get lagged correlations notebook hosted.
+A simple "LOCF" (last observation carried forward) forecaster that forecasts the last observed value and uses historical residuals to create a distributional forecast. This is what the FluSight-baseline is based on, so they should be identical.
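+
+A minimal sketch of the flatline idea, assuming a single geo's weekly series `x` (the production version comes from epipredict):
+
+```r
+flatline_sketch <- function(x, probs = c(0.1, 0.25, 0.5, 0.75, 0.9)) {
+  resid <- diff(x) # residuals of the LOCF point forecast, one step ahead
+  tail(x, 1) + quantile(resid, probs = probs, na.rm = TRUE)
+}
+```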
diff --git a/scripts/covid_hosp_prod.R b/scripts/covid_hosp_prod.R
index a8942b7a..52e2bf5c 100644
--- a/scripts/covid_hosp_prod.R
+++ b/scripts/covid_hosp_prod.R
@@ -35,7 +35,7 @@ if (!g_backtest_mode) {
# override this. It should be a Wednesday.
g_forecast_dates <- round_date(g_forecast_generation_dates, "weeks", week_start = 3)
} else {
- g_forecast_generation_dates <- c(as.Date(c("2024-11-22", "2024-11-27", "2024-12-04", "2024-12-11", "2024-12-18", "2024-12-26", "2025-01-02")), seq.Date(as.Date("2025-01-08"), Sys.Date(), by = 7L))
+ g_forecast_generation_dates <- c(as.Date(c("2024-11-20", "2024-11-27", "2024-12-04", "2024-12-11", "2024-12-18", "2024-12-26", "2025-01-02")), seq.Date(as.Date("2025-01-08"), Sys.Date(), by = 7L))
g_forecast_dates <- seq.Date(as.Date("2024-11-20"), Sys.Date(), by = 7L)
}
@@ -91,13 +91,14 @@ g_windowed_seasonal_extra_sources <- function(epi_data, ahead, extra_data, ...)
fcst
}
g_forecaster_params_grid <- tibble(
- id = c("linear", "windowed_seasonal", "windowed_seasonal_extra_sources", "climate_base", "climate_geo_agged"),
- forecaster = rlang::syms(c("g_linear", "g_windowed_seasonal", "g_windowed_seasonal_extra_sources", "g_climate_base", "g_climate_geo_agged")),
+ id = c("linear", "windowed_seasonal", "windowed_seasonal_extra_sources", "climate_base", "climate_geo_agged", "seasonal_nssp_latest"),
+ forecaster = rlang::syms(c("g_linear", "g_windowed_seasonal", "g_windowed_seasonal_extra_sources", "g_climate_base", "g_climate_geo_agged", "g_windowed_seasonal_extra_sources")),
params = list(
list(),
list(),
list(),
list(),
+ list(),
list()
),
param_names = list(
@@ -105,6 +106,7 @@ g_forecaster_params_grid <- tibble(
list(),
list(),
list(),
+ list(),
list()
)
)
@@ -175,20 +177,32 @@ forecast_targets <- tar_map(
tar_target(
name = forecast_res,
command = {
- nhsn_data <- nhsn_archive_data %>%
- epix_as_of(min(as.Date(forecast_date_int), nhsn_archive_data$versions_end)) %>%
+ # if the forecaster is named latest, it should use the most up to date
+ # version of the data
+ if (grepl("latest", id)) {
+ nhsn_data <- nhsn_archive_data %>%
+ epix_as_of(nhsn_archive_data$versions_end) %>% filter(time_value < as.Date(forecast_date_int))
+ nssp_data <- nssp_archive_data %>%
+ epix_as_of(nssp_archive_data$versions_end) %>% filter(time_value < as.Date(forecast_date_int))
+ } else {
+ nhsn_data <- nhsn_archive_data %>%
+ epix_as_of(min(as.Date(forecast_date_int), nhsn_archive_data$versions_end))
+ nssp_data <- nssp_archive_data %>%
+ epix_as_of(min(as.Date(forecast_date_int), nssp_archive_data$versions_end))
+ }
+ nhsn_data <- nhsn_data %>%
add_season_info() %>%
mutate(
geo_value = ifelse(geo_value == "usa", "us", geo_value),
time_value = time_value - 3
) %>%
- data_substitutions(covid_data_substitutions, as.Date(forecast_generation_date_int)) %>%
filter(geo_value %nin% g_insufficient_data_geos)
+ if (!grepl("latest", id)) {
+ nhsn_data %<>%
+ data_substitutions(covid_data_substitutions, as.Date(forecast_generation_date_int))
+ }
attributes(nhsn_data)$metadata$as_of <- as.Date(forecast_date_int)
- nssp_data <- nssp_archive_data %>%
- epix_as_of(min(as.Date(forecast_date_int), nssp_archive_data$versions_end))
-
forecaster_fn <- get_partially_applied_forecaster(forecaster, aheads, params, param_names)
forecaster_fn(nhsn_data, extra_data = nssp_data) %>%
@@ -429,13 +443,16 @@ if (g_backtest_mode) {
name = joined_forecasts_and_ensembles,
ensemble_targets[["forecasts_and_ensembles"]],
command = {
- dplyr::bind_rows(!!!.x, external_forecasts)
+ filter_shared_geo_dates(
+ dplyr::bind_rows(!!!.x),
+ external_forecasts
+ )
}
),
tar_target(
name = scores,
command = {
- score_forecasts(nhsn_latest_data, joined_forecasts_and_ensembles)
+ score_forecasts(nhsn_latest_data, joined_forecasts_and_ensembles, "covid")
}
),
tar_target(
diff --git a/scripts/flu_hosp_prod.R b/scripts/flu_hosp_prod.R
index 75ad7cbb..7149a29d 100644
--- a/scripts/flu_hosp_prod.R
+++ b/scripts/flu_hosp_prod.R
@@ -42,7 +42,7 @@ if (!g_backtest_mode) {
# override this. It should be a Wednesday.
g_forecast_dates <- round_date(g_forecast_generation_dates, "weeks", week_start = 3)
} else {
- g_forecast_generation_dates <- c(as.Date(c("2024-11-22", "2024-11-27", "2024-12-04", "2024-12-11", "2024-12-18", "2024-12-26", "2025-01-02")), seq.Date(as.Date("2025-01-08"), Sys.Date(), by = 7L))
+ g_forecast_generation_dates <- c(as.Date(c("2024-11-21", "2024-11-27", "2024-12-04", "2024-12-11", "2024-12-18", "2024-12-26", "2025-01-02")), seq.Date(as.Date("2025-01-08"), Sys.Date(), by = 7L))
g_forecast_dates <- seq.Date(as.Date("2024-11-20"), Sys.Date(), by = 7L)
}
@@ -103,13 +103,14 @@ g_windowed_seasonal_extra_sources <- function(epi_data, ahead, extra_data, ...)
fcst
}
g_forecaster_params_grid <- tibble(
- id = c("linear", "windowed_seasonal", "windowed_seasonal_extra_sources", "climate_base", "climate_geo_agged"),
- forecaster = rlang::syms(c("g_linear", "g_windowed_seasonal", "g_windowed_seasonal_extra_sources", "g_climate_base", "g_climate_geo_agged")),
+ id = c("linear", "windowed_seasonal", "windowed_seasonal_extra_sources", "climate_base", "climate_geo_agged", "seasonal_nssp_latest"),
+ forecaster = rlang::syms(c("g_linear", "g_windowed_seasonal", "g_windowed_seasonal_extra_sources", "g_climate_base", "g_climate_geo_agged", "g_windowed_seasonal_extra_sources")),
params = list(
list(),
list(),
list(),
list(),
+ list(),
list()
),
param_names = list(
@@ -117,6 +118,7 @@ g_forecaster_params_grid <- tibble(
list(),
list(),
list(),
+ list(),
list()
)
)
@@ -203,18 +205,27 @@ forecast_targets <- tar_map(
full_data,
command = {
# Train data
+ if (grepl("latest", id)) {
+ train_data <- nhsn_archive_data %>%
+ epix_as_of(nhsn_archive_data$versions_end) %>% filter(time_value < as.Date(forecast_date_int))
+ } else {
train_data <- nhsn_archive_data %>%
- epix_as_of(min(as.Date(forecast_date_int), nhsn_archive_data$versions_end)) %>%
+ epix_as_of(min(as.Date(forecast_date_int), nhsn_archive_data$versions_end))
+ }
+ train_data %<>%
add_season_info() %>%
mutate(
geo_value = ifelse(geo_value == "usa", "us", geo_value),
time_value = time_value - 3,
source = "nhsn"
- ) %>%
- data_substitutions(
- flu_data_substitutions,
- as.Date(forecast_generation_date_int)
- ) %>%
+ )
+ if (!grepl("latest", id)) {
+ train_data %<>% data_substitutions(
+ flu_data_substitutions,
+ as.Date(forecast_generation_date_int)
+ )
+ }
+ train_data %<>%
filter(geo_value %nin% g_insufficient_data_geos)
attributes(train_data)$metadata$as_of <- as.Date(forecast_date_int)
@@ -228,8 +239,14 @@ forecast_targets <- tar_map(
tar_target(
name = forecast_res,
command = {
- nssp_data <- nssp_archive_data %>%
- epix_as_of(min(as.Date(forecast_date_int), nssp_archive_data$versions_end))
+ if (grepl("latest", id)) {
+ nssp_data <- nssp_archive_data %>%
+ epix_as_of(nssp_archive_data$versions_end) %>%
+ filter(time_value < as.Date(forecast_date_int))
+ } else {
+ nssp_data <- nssp_archive_data %>%
+ epix_as_of(min(as.Date(forecast_date_int), nssp_archive_data$versions_end))
+ }
forecaster_fn <- get_partially_applied_forecaster(forecaster, aheads, params, param_names)
@@ -470,18 +487,22 @@ if (g_backtest_mode) {
name = joined_forecasts_and_ensembles,
ensemble_targets[["forecasts_and_ensembles"]],
command = {
- dplyr::bind_rows(!!!.x, external_forecasts)
+ filter_shared_geo_dates(
+ dplyr::bind_rows(!!!.x),
+ external_forecasts
+ )
}
),
tar_target(
name = scores,
command = {
nhsn_latest_end_of_week <-
nhsn_latest_data %>%
mutate(
- time_value = ceiling_date(time_value, unit = "week", week_start = 6)
+ time_value = round_date(time_value, unit = "week", week_start = 6)
)
- score_forecasts(nhsn_latest_end_of_week, joined_forecasts_and_ensembles)
+ score_forecasts(nhsn_latest_end_of_week, joined_forecasts_and_ensembles, "flu")
}
),
tar_target(
diff --git a/scripts/get_forecast_data.r b/scripts/get_forecast_data.r
index 93a3308e..20178b84 100644
--- a/scripts/get_forecast_data.r
+++ b/scripts/get_forecast_data.r
@@ -150,10 +150,9 @@ fetch_forecast_files <- function(sync_to_s3 = TRUE, disease) {
}
cli::cli_alert_info("Fetching COVID forecasts {run_time_local} (UTC: {run_time})")
-fetch_forecast_files(disease = "covid")
+covid_forecasts <- fetch_forecast_files(disease = "covid")
cli::cli_alert_info("Fetching FLU forecasts {run_time_local} (UTC: {run_time})")
-fetch_forecast_files(disease = "flu")
-
+flu_forecasts <- fetch_forecast_files(disease = "flu")
print(glue::glue("Run successfully finished at {Sys.time()}"))
print("-------------------------------------------------------")
print("-------------------------------------------------------")
diff --git a/scripts/one_offs/read_covid_forecast_hub_data.jl b/scripts/one_offs/read_covid_forecast_hub_data.jl
index 327bd2a8..eb2c019c 100644
--- a/scripts/one_offs/read_covid_forecast_hub_data.jl
+++ b/scripts/one_offs/read_covid_forecast_hub_data.jl
@@ -1,9 +1,11 @@
# this was run from within the https://github.com/reichlab/covid19-forecast-hub repo,
-# specifically in the data-processed folder
+# specifically in the model-output folder; if starting from this repo, run
+# cd("../../../covid19-forecast-hub/model-output")
# to get the rds, run
#
-# full_results <- readr::read_csv("../covid19-forecast-hub/data-processed/covid19-2023season-results.csv")
-# aws.s3::s3save(full_results, object = "covid19_forecast_hub_2023_full_summed.rds", bucket = "forecasting-team-data")
+# full_results <- readr::read_csv(here::here("cache/covid19-2024season-results.csv"))
+# aws.s3::s3save(full_results, object = "covid19_forecast_hub_2024_full.rds", bucket = "forecasting-team-data")
#
using Base: floatrange
using CSV
@@ -13,13 +15,14 @@ using Dates
using RData
import Base.lowercase
pwd()
-res = CSV.read("COVIDhub_CDC-ensemble/2023-10-02-COVIDhub_CDC-ensemble.csv", DataFrame)
-pathname = "COVIDhub_CDC-ensemble/"
-filename = "2023-10-02-COVIDhub_CDC-ensemble.csv"
-state_names = CSV.read("../data-locations/locations.csv", DataFrame)
+res = CSV.read("CovidHub-ensemble/2024-11-23-CovidHub-ensemble.csv", DataFrame)
+pathname = "CovidHub-ensemble"
+filename = "2024-11-23-CovidHub-ensemble.csv"
+state_names = CSV.read("../auxiliary-data/locations.csv", DataFrame)
lowercase(m::Missing) = m
@rtransform! state_names @passmissing :abbreviation = lowercase(:abbreviation)
@select! state_names :abbreviation :location
function format_file(pathname, filename, state_names)
if length(filename) < 10 ||
match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", filename[1:10]) == nothing ||
@@ -28,23 +31,29 @@ function format_file(pathname, filename, state_names)
end
println(joinpath(pathname, filename))
res = CSV.read(joinpath(pathname, filename), DataFrame, missingstring="NA", types=Dict("value" => Float64))
- if !("forecast_date" in names(res)) ||
- res[!, :forecast_date] |> minimum < Date(2023, 1, 1)
+ if "forecast_date" in names(res)
+ @rename! res :reference_date = :forecast_date
+ end
+ if !("reference_date" in names(res)) ||
+ (res[!, :reference_date] |> minimum) < Date(2023, 1, 1)
return DataFrame()
end
- @transform(res, :target = (:target))
res = @chain res begin
- @rtransform :target = parse(Int64, match(r"[0-9]*", :target).match)
+ # old format problem, ahead is now recorded elsewhere
+ #@rtransform :target = parse(Int64, match(r"[0-9]*", :target).match)
@transform :forecaster = pathname[3:end]
- @rsubset :type == "quantile"
+ @rsubset :output_type == "quantile"
end
res = leftjoin(res, state_names, on=:location)
- @select! res :forecaster :geo_value = :abbreviation :forecast_date :target_end_date :ahead = :target :quantile :value
- @chain res begin
- @rtransform :week_ahead = div(:ahead, 7)
- @groupby :forecaster :geo_value :forecast_date :week_ahead :quantile
- @combine :value = sum(:value)
- end
+ @select res :forecaster :geo_value = :abbreviation :forecast_date = :reference_date :target_end_date :ahead = :horizon :quantile = :output_type_id :value
+ # this is for converting daily forecasts into weekly, whereas this script is currently downloading weekly forecasts
+ #@chain res begin
+ # @rtransform :week_ahead = div(:ahead, 7)
+ # @groupby :forecaster :geo_value :reference_date :week_ahead :quantile
+ # @combine :value = sum(:value)
+ #end
end
results = DataFrame[]
for (root, dirs, files) in walkdir(".")
@@ -52,5 +61,7 @@ for (root, dirs, files) in walkdir(".")
push!(results, format_file(root, file, state_names))
end
end
full_results = vcat(results...)
-CSV.write("covid19-2023season-results.csv", full_results)
+CSV.write("../../exploration-tooling/cache/covid19-2024season-results.csv", full_results)
diff --git a/scripts/reports/first_day_wrong.Rmd b/scripts/reports/first_day_wrong.Rmd
new file mode 100644
index 00000000..6a5fa417
--- /dev/null
+++ b/scripts/reports/first_day_wrong.Rmd
@@ -0,0 +1,112 @@
+---
+title: "Why are the COVID 2024-2025 season scores so bad on the first day?"
+date: "compiled on `r format(Sys.time(), '%d %B %Y')`"
+output:
+ html_document:
+ code_folding: hide
+editor_options:
+ chunk_output_type: console
+---
+
+```{css, echo=FALSE}
+body {
+ display: block;
+ max-width: 1280px !important;
+ margin-left: auto;
+ margin-right: auto;
+}
+
+body .main-container {
+ max-width: 1280px !important;
+ width: 1280px !important;
+}
+```
+
+$$\\[.4in]$$
+
+```{r echo=FALSE}
+knitr::opts_chunk$set(
+ fig.align = "center",
+ message = FALSE,
+ warning = FALSE,
+ cache = FALSE
+)
+ggplot2::theme_set(ggplot2::theme_bw())
+knitr::opts_template$set(figure1 = list(fig.height = 4, fig.width=4))
+source(here::here("R/load_all.R"))
+```
+
+The scores on the first forecast day for COVID are all unusually bad, bad enough that we initially thought it was a bug.
+This notebook demonstrates that these forecasts are somewhat reasonable given the context; it is separated out because it would otherwise be distracting.
+The primary reason is that the data as of that day had an unusual amount of revision.
+In comparison, the first flu forecast was delayed by a day, which allowed a new data revision to be used and explains why flu doesn't have this problem.
+
+# Revision Behavior
+
+First, getting the necessary archive, forecasts, and scores, and plotting the versions around the first forecast day `2024-11-20` (the week of `2024-11-23`):
+
+```{r, fig.height = 15, fig.width = 15}
+covid_scores <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "scores"))
+covid_forecasts <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "joined_forecasts_and_ensembles"))
+covid_archive <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "nhsn_archive_data"))
+text_size <- 6
+covid_archive$DT %>%
+ filter(time_value < as.Date("2024-11-23") + 4*7, time_value > "2024-09-01") %>%
+ as_epi_archive() %>%
+ autoplot(.versions = c(as.Date("2024-11-20"), covid_archive$versions_end)) +
+ geom_vline(aes(xintercept = as.Date("2024-11-23")))
+```
+
+Most locations have a significantly different version on `2024-11-20`, some by as much as 4 times the final version.
+
+```{r}
+# Building a function to plot the different forecasters
+plot_problem_day <- function(forecaster, text_size = 6) {
+  cmu_timeseries_fc <- covid_forecasts %>%
+    filter(forecaster == .env$forecaster) %>%
+    filter(forecast_date == "2024-11-23")
+ cmu_timeseries_wide <- cmu_timeseries_fc %>%
+ pivot_wider(names_from = "quantile", values_from = "value")
+ covid_archive$DT %>% filter(time_value < as.Date("2024-11-23") + 4*7, time_value > "2024-09-01") %>% as_epi_archive() %>% autoplot() +
+ geom_vline(aes(xintercept = as.Date("2024-11-23"))) +
+ geom_ribbon(data = cmu_timeseries_wide, aes(x = target_end_date, ymin = `0.1`, ymax = `0.9`), alpha = 0.3) +
+ geom_ribbon(data = cmu_timeseries_wide, aes(x = target_end_date, ymin = `0.25`, ymax = `0.75`), alpha = 0.3) +
+ geom_line(data = cmu_timeseries_wide, aes(x = target_end_date, y = `0.5`)) +
+ facet_wrap(~geo_value, scale = "free")
+}
+```
+
+### Windowed seasonal
+```{r, fig.height = 15, fig.width = 15}
+plot_problem_day("windowed_seasonal")
+```
+
+which is extrapolating out in a straight line from trends that are reporting artifacts.
+
+### Covidhub-baseline
+Covidhub baseline forms a good sanity check, since it is forecasting out from the versioned data (which explains why it is about as bad as `windowed_seasonal_extra_sources`).
+
+```{r, fig.height = 15, fig.width = 15}
+plot_problem_day("CovidHub-baseline")
+```
+
+### Linear
+```{r, fig.height = 15, fig.width = 15}
+plot_problem_day("linear")
+```
+
+which is extrapolating out in a straight line from trends that are reporting artifacts.
+
+### Climate
+```{r, fig.height = 15, fig.width = 15}
+plot_problem_day("climate_base")
+```
+
+This one is wrong simply because this season was unusually low at this point.
+
+### CMU-TimeSeries
+This one is a bit odd: the forecaster we were using at the time was a simple average of the linear and climate forecasters, so it is off for a combination of the reasons those two are off.
+
+```{r, fig.height = 15, fig.width = 15}
+plot_problem_day("CMU-TimeSeries")
+```
diff --git a/scripts/reports/revision_summary_report_2025.Rmd b/scripts/reports/revision_summary_report_2025.Rmd
new file mode 100644
index 00000000..6b3d0db2
--- /dev/null
+++ b/scripts/reports/revision_summary_report_2025.Rmd
@@ -0,0 +1,425 @@
+---
+title: "Revision Summary 2025"
+date: "Rendered: `r format(Sys.time(), '%Y-%m-%d %H:%M:%S')`"
+output:
+ html_document:
+ code_folding: hide
+ toc: True
+ # self_contained: False
+ # lib_dir: libs
+editor_options:
+ chunk_output_type: console
+---
+
+```{css, echo=FALSE}
+body {
+ display: block;
+ max-width: 1280px !important;
+ margin-left: auto;
+ margin-right: auto;
+}
+
+body .main-container {
+ max-width: 1280px !important;
+ width: 1280px !important;
+}
+```
+
+$$\\[.4in]$$
+
+```{r echo=FALSE, warning=FALSE,message=FALSE}
+knitr::opts_chunk$set(
+ fig.align = "center",
+ message = FALSE,
+ warning = FALSE,
+ cache = FALSE
+)
+ggplot2::theme_set(ggplot2::theme_bw())
+source(here::here("R/load_all.R"))
+```
+
+# Overall takeaways
+
+There is substantial under-reporting behavior that is fairly consistent for a single geo.
+We can probably improve our forecasts by including revision behavior.
+Further, flu and covid revision behavior is fairly strongly correlated; it is reported through the same channels by the same people, so this makes sense.
+We should look into the extra columns to see if they provide useful information for handling revision behavior.
+
+# Flu revision
+
+## NHSN
+
+### Revision behavior
+
+First we get the archive and remove the data older than the first version so as not to clog up the revision behavior, and display the overall revision summary[^1].
+
+```{r}
+nhsn_archive_flu <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "nhsn_archive_data"))
+
+nhsn_archive_flu <- nhsn_archive_flu$DT %>% filter(time_value >= "2024-11-19", geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
+nhsn_archive_flu$time_type <- "day"
+revision_summary <- nhsn_archive_flu %>% epiprocess::revision_analysis(value, min_waiting_period = NULL)
+```
+
+```{r}
+revision_summary %>% print(quick_revision = 7)
+```
+
+So around a fifth have no revisions, around a quarter resolve within a week, and around two thirds have a small number of revisions.
+Around half of those with revisions have little relative change (10% of the max value).
+The "actual value" change isn't really worth thinking about because this is counts data (so there being only 6 doesn't tell us much).
+
+Here's a plot of the version changes for all locations.
+
+```{r, out.width = "120%"}
+text_size <- 6
+nhsn_archive_flu %>%
+  autoplot(value) +
+  theme(
+    strip.text.x = element_text(size = text_size, margin = margin(.1, 0, .1, 0, "cm")),
+    axis.text = element_text(size = text_size, angle = 45),
+    legend.title = element_text(size = text_size),
+    legend.text = element_text(size = text_size),
+    legend.key.size = unit(0.5, "cm")
+  ) +
+  scale_size_manual(values = c(0.5))
+```
+
+Since this is probably too small to actually be legible, let's figure out the states with the worst revision behavior and plot those.
+Worst in this case means the worst average relative spread:
+
+```{r}
+av_re_spread <- revision_summary$revision_behavior %>%
+ group_by(geo_value) %>%
+ summarize(rel_spread = mean(rel_spread, na.rm = TRUE)) %>%
+ arrange(desc(rel_spread)) %>%
+ filter(geo_value %nin% c("vi", "as", "gu"))
+av_re_spread %>%
+ print(n=20)
+```
+
+The worst 9, excluding the geos we don't actually forecast (so no `as`, `vi`, or `gu`):
+
+```{r}
+nhsn_archive_flu %>% autoplot(.facet_filter = geo_value %in% av_re_spread$geo_value[1:9]) + theme(strip.text.x = element_text(size = 8))
+```
+
+And the next 9 worst:
+
+```{r}
+nhsn_archive_flu %>% autoplot(.facet_filter = geo_value %in% av_re_spread$geo_value[10:18]) + theme(strip.text.x = element_text(size = 8))
+```
+
+
+These cover a range of revision behaviors; some are off on a single `time_value` in a terrible way, such as `mn` or `nh`, while most have fairly constant revisioning.
+`az` is unusual in having two bad `version`s when they revised the entire history down by a factor of 2, and then reverted to what it had been previously.
+Most are systematically reporting lower than the actual values, with some such as `nm`, `nh`, `ok`, and `ak` particularly bad about this.
+It is probably worth adding a measure at the key level of how systematic the bias is to `revision_analysis`, or perhaps a separate function.
+These seem likely to be estimable beforehand.
+Exceptions that include at least one case of over-reporting: `mn`, `id`, and `nh`.
+These are not later backfilled; they seem more like bad estimates or entry error.
+
+### Data substitutions
+We also need to compare this revision behavior with our data substitutions:
+```{r}
+data_substitutions <- readr::read_csv(
+ here::here(glue::glue("flu_data_substitutions.csv")),
+ comment = "#",
+ show_col_types = FALSE
+ ) %>%
+ mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
+nhsn_archive_flu %>%
+ autoplot(value, .facet_filter = geo_value %in% (data_substitutions$geo_value %>% unique())) + geom_point(data = data_substitutions, aes(x = time_value, y = value)) + facet_wrap(~geo_value, scale = "free")
+```
+
+which doesn't look all that great.
+To calculate how much closer (or further) we were to the final value, first we construct the relevant snapshots:
+```{r}
+final_values <- nhsn_archive_flu %>% epix_as_of_current() %>% mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
+data_as_it_was <- map(
+ data_substitutions$forecast_date %>% unique(),
+ \(version) nhsn_archive_flu %>% epix_as_of(version) %>% mutate(forecast_date = version)
+) %>%
+ list_rbind() %>% mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
+```
+
+and then we compute several notions of differences
+
+```{r}
+full_table <- data_substitutions %>%
+ left_join(
+ final_values,
+ by = join_by(geo_value, time_value), suffix = c("_substituted", "_final")
+ ) %>%
+ left_join(
+ data_as_it_was,
+ by = join_by(geo_value, forecast_date, time_value)
+ ) %>%
+  rename(as_of_value = value)
+
+diffs <- full_table %>%
+ mutate(
+ abs_diff = value_substituted - value_final,
+ rel_diff = abs_diff / value_final,
+ rel_rev_diff = abs_diff / (-value_final + as_of_value),
+ ) %>%
+ arrange(rel_diff)
+diffs %>%
+ select(-forecast_date, -value_substituted, -value_final, -as_of_value) %>%
+ print(n = 23)
+```
+
+The table is sorted by `rel_diff`.
+
+- `abs_diff` gives how much our adjustment was over by (so positive means we were over the latest value).
+
+- `rel_diff` gives the difference relative to the latest value; for `nh` on `03/29` (the last entry in the table), our estimate was nearly twice as large as the latest value.
+ This is mostly to compensate for the different magnitudes of values across time and geo.
+
+- `rel_rev_diff` is the most appropriate to view as a substitution scoring. It divides the `abs_diff` by how much the value as of the forecast date was off; for `nh` on `03/29` again, it was merely 73%, so we did bring it closer to the actual value. Any of these which are <1 are "successful" in the sense that we were closer to the latest value than the as_of value was. An infinite value tells us that we adjusted a value that hasn't been corrected. The sign for `rel_rev_diff` is a bit confusing, and tells us whether our estimate and the as of value were both larger/smaller than the latest value, or one larger and one smaller.
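+
+As a toy reading of `rel_rev_diff` (numbers invented, not from the data): if the latest value is 100, the as-of value was 150, and we substituted 120, then
+
+```{r}
+value_final <- 100; as_of_value <- 150; value_substituted <- 120
+abs_diff <- value_substituted - value_final # 20: we were over the latest value
+abs_diff / (-value_final + as_of_value) # 0.4 < 1, so the substitution helped
+```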
+
+For how many did we substitute a more accurate value?
+
+```{r}
+mean(abs(diffs$rel_rev_diff) < 1)
+```
+
+around 35%, so not a great track record overall.
+How about lower than the target vs higher than the target?
+```{r}
+diffs %>%
+ mutate(is_over = rel_diff > 0) %>%
+ group_by(is_over) %>%
+ summarize(fraction_improved = mean(abs(rel_rev_diff) < 1))
+```
+
+So we did marginally better when it was below, but much worse when it was above.
+
+Overall, it turns out our value substitutions did not actually help much for flu.
+
+## NSSP
+
+```{r}
+nssp_archive_flu <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "nssp_archive_data"))
+
+nssp_archive_flu <- nssp_archive_flu$DT %>% filter(time_value >= "2024-11-19", geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
+nssp_archive_flu$time_type <- "day"
+revision_summary_nssp <- nssp_archive_flu %>% epiprocess::revision_analysis(nssp, min_waiting_period = NULL)
+```
+
+```{r}
+revision_summary_nssp %>% print(quick_revision = 7)
+```
+
+So very few weeks have no revision, but ~70% have only a small number of revisions,
+and most (87%) are very close to their final values.
+
+```{r}
+av_re_spread <- revision_summary_nssp$revision_behavior %>%
+ group_by(geo_value) %>%
+ summarize(rel_spread = mean(rel_spread, na.rm = TRUE)) %>%
+ arrange(desc(rel_spread)) %>%
+ filter(geo_value %nin% c("vi", "as", "gu"))
+```
+
+
+```{r}
+nssp_archive_flu %>% autoplot(nssp, .facet_filter = geo_value %in% av_re_spread$geo_value[1:9]) + theme(strip.text.x = element_text(size = 8))
+```
+
+```{r}
+nssp_archive_flu %>% autoplot(nssp, .facet_filter = geo_value %in% av_re_spread$geo_value[10:18]) + theme(strip.text.x = element_text(size = 8))
+```
+
+These corrections are *way* more regular than nhsn, so we can definitely do backcasting to adjust the values.
+
+### Correlation with latest
+
+Does NSSP actually correlate better with the latest value than nhsn itself does?
+This was a property we were relying on through the season to generate our corrections.
+
+# Covid revision
+And now for roughly the same analysis, but for covid.
+
+## NHSN
+First we get the archive and remove the data older than the first version so as not to clog up the revision behavior, and display the overall revision summary[^1].
+
+```{r}
+nhsn_archive_covid <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "nhsn_archive_data"))
+
+nhsn_archive_covid <- nhsn_archive_covid$DT %>% filter(time_value >= "2024-11-19") %>%
+ filter(geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
+nhsn_archive_covid$time_type <- "day"
+revision_summary <- nhsn_archive_covid %>%
+ epiprocess::revision_analysis(value, min_waiting_period = NULL)
+revision_summary %>% print(quick_revision = 7)
+```
+
+Around a fifth have no revisions, around a third resolve within a week, and around three quarters have a small number of revisions.
+Around 60% of those with revisions have little relative change (10% of the max value).
+The "actual value" change isn't really worth thinking about because this is counts data (so there being only 6 doesn't tell us much).
+
+Here's a plot of the version changes for all locations.
+
+```{r}
+nhsn_archive_covid %>%
+  autoplot(value) +
+  theme(
+    strip.text.x = element_text(size = text_size, margin = margin(.1, 0, .1, 0, "cm")),
+    axis.text = element_text(size = text_size, angle = 45),
+    legend.title = element_text(size = text_size),
+    legend.text = element_text(size = text_size),
+    legend.key.size = unit(0.5, "cm")
+  ) +
+  scale_size_manual(values = c(0.5))
+```
+
+Since this is probably too small to actually be legible, let's figure out the states with the worst revision behavior and plot those.
+Worst in this case means the worst average relative spread:
+
+```{r}
+av_re_spread <- revision_summary$revision_behavior %>%
+ group_by(geo_value) %>%
+ summarize(rel_spread = mean(rel_spread, na.rm = TRUE)) %>%
+ arrange(desc(rel_spread)) %>%
+ filter(geo_value %nin% c("vi", "as", "gu"))
+av_re_spread %>%
+ print(n=20)
+```
+
+This is remarkably similar to the average relative spread in the flu NHSN section above.
+The worst 9, excluding the geos we don't actually forecast (so no `as`, `vi`, or `gu`):
+
+```{r}
+nhsn_archive_covid %>% autoplot(.facet_filter = geo_value %in% av_re_spread$geo_value[1:9]) + theme(strip.text.x = element_text(size = 8))
+```
+
+And the next 9 worst:
+
+```{r}
+nhsn_archive_covid %>% autoplot(.facet_filter = geo_value %in% av_re_spread$geo_value[10:18]) + theme(strip.text.x = element_text(size = 8))
+```
+
+Strictly visually, this seems to revise more than the flu data (this doesn't actually fit well with the numeric revision analysis so something odd is going on).
+Perhaps the revisions are more chaotic.
+`mn`, instead of having a single time point wrong for a single version, has its entire trajectory wrong in the same way that `az` does, possibly in the same `version`.
+`az` has similar revision behavior (a couple of weeks where they were wildly off).
+
+`nh` has similar behavior in over-estimating a specific single time value quite badly, but otherwise having typical under-reporting problems.
+
+`dc` has some new and wild behavior; this is likely as visually striking as it is because the values are so small.
+
+`mo` has new revision behavior, primarily in the magnitude of the difference.
+
+`ok` has more extreme under-reporting than in the case of flu, but again this is likely a function of the number of cases, suggesting their under-reporting happens in absolute counts rather than relative ones[^2].
+
+Like flu, these seem likely to be estimable beforehand.
+
+### Data substitutions
+And we actually need to compare revision behavior with our estimates of the correct values:
+```{r}
+data_substitutions <- readr::read_csv(
+ here::here(glue::glue("covid_data_substitutions.csv")),
+ comment = "#",
+ show_col_types = FALSE
+ ) %>%
+ mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
+nhsn_archive_covid %>%
+ autoplot(value, .facet_filter = geo_value %in% (data_substitutions$geo_value %>% unique())) + geom_point(data = data_substitutions, aes(x = time_value, y = value)) + facet_wrap(~geo_value, scale = "free")
+```
+
+To calculate how much closer (or further) we were from the final value, first we construct the relevant snapshots:
+```{r}
+final_values <- nhsn_archive_covid %>% epix_as_of_current() %>% mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
+data_as_it_was <- map(
+ data_substitutions$forecast_date %>% unique(),
+ \(version) nhsn_archive_covid %>% epix_as_of(version) %>% mutate(forecast_date = version)
+) %>%
+ list_rbind() %>% mutate(time_value = round_date(time_value, unit = "week", week_start = 6))
+```
+
+
+```{r}
+full_table <- data_substitutions %>%
+ left_join(
+ final_values,
+ by = join_by(geo_value, time_value), suffix = c("_substituted", "_final")
+ ) %>%
+ left_join(
+ data_as_it_was,
+ by = join_by(geo_value, forecast_date, time_value)
+ ) %>%
+  rename(as_of_value = value)
+
+diffs <- full_table %>%
+ mutate(
+ abs_diff = value_substituted - value_final,
+ rel_diff = abs_diff / value_final,
+ rel_rev_diff = abs_diff / (-value_final + as_of_value),
+ ) %>%
+ arrange(rel_diff)
+diffs %>%
+ select(-forecast_date, -value_substituted, -value_final, -as_of_value) %>%
+ print(n = 23)
+```
+
+The table is sorted by `rel_diff`.
+- `abs_diff` gives how much our adjustment overshot (positive means we were above the latest value).
+
+- `rel_diff` gives that difference relative to the latest value.
+
+- `rel_rev_diff` divides `abs_diff` by how much the value as of the forecast date was off. Any of these with magnitude <1 are "successful", in the sense that we were closer to the latest value than the as-of value was. An infinite value tells us that we adjusted a value that was never corrected. The sign of `rel_rev_diff` tells us whether our estimate and the as-of value were on the same side of the latest value (positive) or on opposite sides (negative); see the toy example below.
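+
+To make the sign convention concrete, here is a toy example with made-up numbers (not from the table above):
+
+```{r}
+# Made-up example: the latest value is 100, we substituted 110, and the
+# as-of value was 80.
+value_final <- 100
+value_substituted <- 110
+as_of_value <- 80
+abs_diff <- value_substituted - value_final # 10: we were over
+rel_diff <- abs_diff / value_final # 0.1
+rel_rev_diff <- abs_diff / (-value_final + as_of_value) # 10 / -20 = -0.5
+# negative: we were over while the as-of value was under; magnitude < 1, so
+# our substitution was closer to the latest value than the as-of value was
+abs(rel_rev_diff) < 1
+```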
+
+We did fewer substitutions in Covid, but they were more consistently good, and there were no cases where we changed a value we shouldn't have.
+
+For what fraction did we substitute a more accurate value?
+
+```{r}
+mean(abs(diffs$rel_rev_diff) < 1)
+```
+
+50%, so overall we were basically guessing.
+How about lower than the target vs higher than the target?
+```{r}
+diffs %>%
+ mutate(is_over = rel_diff > 0) %>%
+ group_by(is_over) %>%
+ summarize(fraction_improved = mean(abs(rel_rev_diff) < 1))
+```
+
+When we were under, we always improved the situation; however, when we were over, we sometimes made it worse.
+
+
+
+## NSSP
+
+```{r}
+nssp_archive_covid <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "nssp_archive_data"))
+
+nssp_archive_covid <- nssp_archive_covid$DT %>% filter(time_value >= "2024-11-19", geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
+nssp_archive_covid$time_type <- "day"
+revision_summary_nssp <- nssp_archive_covid %>% epiprocess::revision_analysis(nssp, min_waiting_period = NULL)
+```
+
+```{r}
+revision_summary_nssp %>% print(quick_revision = 7)
+```
+
+Few weeks have no revision at all (only 14%), but ~87% have only a small number of revisions,
+and most (81%) end close to their final values.
+
+```{r}
+av_re_spread <- revision_summary_nssp$revision_behavior %>%
+ group_by(geo_value) %>%
+ summarize(rel_spread = mean(rel_spread, na.rm = TRUE)) %>%
+ arrange(desc(rel_spread)) %>%
+ filter(geo_value %nin% c("vi", "as", "gu"))
+```
+
+
+```{r}
+nssp_archive_covid %>% autoplot(nssp, .facet_filter = geo_value %in% av_re_spread$geo_value[1:9]) + theme(strip.text.x = element_text(size = 8))
+```
+
+```{r}
+nssp_archive_covid %>% autoplot(nssp, .facet_filter = geo_value %in% av_re_spread$geo_value[10:18]) + theme(strip.text.x = element_text(size = 8))
+```
+
+A similar story to flu: these corrections are *way* more regular than nhsn's, so we can definitely do backcasting to adjust the values.
+
+# Correlations between the two archives
+
+Visually, the flu and covid nhsn datasets seem to have related revision behavior.
+We should probably study this more carefully, but the question is a somewhat difficult one.
+One potential way to go about this is for each `(time_value, geo_value)` pair, do a correlation of the time series across `version`.
+Alternatively, we could do the same, but for the differences between versions (so correlate the correction amounts); a rough sketch of the first idea is below.
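+
+A hedged sketch of the per-`(time_value, geo_value)` correlation, assuming the flu archive `nhsn_archive_flu` from the Flu section is still in scope (other names as above):
+
+```{r}
+# For each (time_value, geo_value) pair, correlate the flu and covid revision
+# sequences across the versions the two archives have in common.
+flu_versions <- nhsn_archive_flu$DT %>% as_tibble() %>% rename(flu_value = value)
+covid_versions <- nhsn_archive_covid$DT %>% as_tibble() %>% rename(covid_value = value)
+version_cors <- inner_join(
+  flu_versions, covid_versions,
+  by = c("geo_value", "time_value", "version")
+) %>%
+  group_by(geo_value, time_value) %>%
+  filter(n() >= 3) %>% # need a few shared versions for a meaningful correlation
+  summarize(version_cor = cor(flu_value, covid_value), .groups = "drop")
+version_cors %>% arrange(desc(abs(version_cor)))
+```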
+
+[^1]: `min_waiting_period` is `NULL` here since we're plotting mid-season, while `quick_revision = 7` because this is weekly data represented using days (because the versions are days).
+
+[^2]: which would definitely be a bit odd of an effect.
diff --git a/scripts/reports/season_summary_2025.Rmd b/scripts/reports/season_summary_2025.Rmd
new file mode 100644
index 00000000..968cf340
--- /dev/null
+++ b/scripts/reports/season_summary_2025.Rmd
@@ -0,0 +1,1064 @@
+---
+title: "Season Summary 2024-2025"
+date: "compiled on `r format(Sys.time(), '%d %B %Y')`"
+output:
+ html_document:
+ code_folding: hide
+ toc: True
+editor_options:
+ chunk_output_type: console
+---
+
+```{css, echo=FALSE}
+body {
+ display: block;
+ max-width: 1280px !important;
+ margin-left: auto;
+ margin-right: auto;
+}
+
+body .main-container {
+ max-width: 1280px !important;
+ width: 1280px !important;
+}
+```
+
+$$\\[.4in]$$
+
+```{r echo=FALSE, warning=FALSE,message=FALSE}
+knitr::opts_chunk$set(
+ fig.align = "center",
+ message = FALSE,
+ warning = FALSE,
+ cache = FALSE
+)
+ggplot2::theme_set(ggplot2::theme_bw())
+source(here::here("R/load_all.R"))
+```
+
+```{r setup, include=FALSE}
+library(scales)
+library(DT)
+
+# Define aggregation functions
+Mean <- function(x) mean(x, na.rm = TRUE)
+GeoMean <- function(x, offset = 0) exp(Mean(log(x + offset)))
+our_forecasters <- c("linear", "windowed_seasonal", "windowed_seasonal_nssp", "climate_base", "climate_geo_agged", "climate_linear", "ensemble_windowed", "retro_submission", "CMU-TimeSeries", "seasonal_nssp_latest")
+flu_scores <-
+ qs2::qs_read(here::here("flu_hosp_prod", "objects", "scores")) %>%
+ mutate(forecaster = case_match(
+ forecaster,
+ "windowed_seasonal_extra_sources" ~ "windowed_seasonal_nssp",
+ "ensemble_linclim_windowed_seasonal" ~ "retro_submission",
+ "ens_ar_only" ~ "ensemble_windowed",
+ .default = forecaster
+ )) %>%
+ mutate(our_forecaster = forecaster %in% our_forecasters)
+flu_forecasts <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "joined_forecasts_and_ensembles"))
+flu_forecasts$forecaster %<>% case_match(
+ "windowed_seasonal_extra_sources" ~ "windowed_seasonal_nssp",
+ "ensemble_linclim_windowed_seasonal" ~ "retro_submission",
+ "ens_ar_only" ~ "ensemble_windowed",
+ .default = flu_forecasts$forecaster
+ )
+forecast_dates <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "forecast_dates"))
+covid_scores <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "scores")) %>%
+ mutate(forecaster = case_match(
+ forecaster,
+ "windowed_seasonal_extra_sources" ~ "windowed_seasonal_nssp",
+ "ensemble_linclim_windowed_seasonal" ~ "retro_submission",
+ "ens_ar_only" ~ "ensemble_windowed",
+ .default = forecaster
+ ))
+covid_forecasts <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "joined_forecasts_and_ensembles")) %>% ungroup()
+covid_forecasts$forecaster %<>% case_match(
+ "windowed_seasonal_extra_sources" ~ "windowed_seasonal_nssp",
+ "ensemble_linclim_windowed_seasonal" ~ "retro_submission",
+ "ens_ar_only" ~ "ensemble_windowed",
+ .default = covid_forecasts$forecaster
+ )
+
+forecast_week <- flu_scores$forecast_date %>% unique()
+forecast_weeks_to_plot <- c(seq.Date(min(forecast_week), max(forecast_week), by = 3*7), as.Date("2025-01-18"), as.Date("2025-02-01"))
+forecast_weeks_to_plot %in% (flu_scores$forecast_date %>% unique())
+forecast_weeks_to_plot %in% (covid_scores$forecast_date %>% unique())
+```
+
+# Models used
+One thing to note: all of these models filter out the 2020/21 and 2021/22 seasons.
+For both flu and covid these seasons are either unusually large or unusually small, and don't warrant inclusion.
+We can split the models and ensembles into 3 categories: the ad-hoc models that we created in response to the actual data that we saw, the AR models that we had been backtesting, and the ensembles.
+
+### The "ad-hoc" models
+
+- `climate_base` uses a 7 week window around the target and forecast date to establish quantiles.
+ `climate_base` does this separately for each geo.
+- `climate_geo_agged` on the other hand converts to rates, pools all geos, computes quantiles using similar time windows, and then converts back to counts.
+ There is effectively only one prediction, scaled to fit each geo.
+- `linear` does a linear extrapolation of the last 4 weeks of data on a rates scale.
+ Initially it had an intercept, but this was removed when it caused the model to not reproduce the -1 ahead data exactly.
+ This change was made on Jan 8th, in the commit with hash 5f7892b.
+ The quantiles are ad-hoc; the residuals are pooled, symmetrized, truncated using some bounds hand-selected to set the quantiles at a reasonable width, and then propagated forward using `propagate_samples` from epipredict.
+- `climate_linear` combines the `climate_*` models with the `linear` model.
+ It does two linear weightings between the linear model and the climate models.
+ As the ahead goes from -1 to 4, it linearly interpolates between a 5% weight on the climate model and a 90% weight on the climate model (so the furthest ahead is mostly a climate model).
+ At the same time, as the quantile level goes further away from the median, it interpolates between a 10% weight on the climate model at the median and a 100% weight on the climate model at either the 1% or 99% quantile levels.
+  In net, at the median -1 ahead, the climate models have a weight of 0.5% and the linear model 99.5% (see the sketch below).
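+
+A minimal sketch of this weighting logic (illustrative only, not the repo's actual implementation), reproducing the 0.5% figure above:
+
+```{r}
+# Combine the two linear interpolations multiplicatively: one over aheads,
+# one over quantile levels (symmetric around the median).
+climate_weight <- function(ahead, quantile_level) {
+  ahead_weight <- approx(x = c(-1, 4), y = c(0.05, 0.90), xout = ahead)$y
+  quantile_weight <- approx(x = c(0.01, 0.5, 0.99), y = c(1, 0.1, 1), xout = quantile_level)$y
+  ahead_weight * quantile_weight
+}
+climate_weight(-1, 0.5) # 0.005, i.e. the 0.5% quoted above
+```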
+
+
+A plot of the `climate` weights:
+
+```{r climate_weight_plot, fig.width = 12, fig.height = 3}
+weights <-
+ make_ahead_weights(-1:3) %>%
+ left_join(
+ make_quantile_weights(covidhub_probs()),
+ by = c("forecast_family"),
+ relationship = "many-to-many"
+ ) %>%
+ mutate(weight = weight.x * weight.y) %>%
+ select(forecast_family, quantile, ahead, weight)
+weights %>% filter(forecast_family == "climate") %>% ggplot(aes(x = factor(ahead), y = factor(quantile), fill = weight)) + geom_tile() + scale_fill_viridis_c(limits = c(0,1))
+```
+
+
+
+### The AR models
+
+- `windowed_seasonal` is an AR forecaster using lags 0 and 7 that uses training data from an 8 week window from each year.
+  It does quartic-root scaling along with quantile and median whitening (sketched after this list).
+ In addition to dropping the first 2 seasons, the windowed models drop the summers for the purposes of determining whitening behavior.
+ For flu, this augments with ili and flusurv (so they are added as additional rows, with their own scaling/centering).
+ Covid doesn't have a comparable dataset.
+- `windowed_seasonal_nssp` is like `windowed_seasonal`, but also has `nssp` as an exogenous component.
+ Note that for flu, this effectively means throwing out the ili and flusurv data, since `nssp` is only defined recently.
+ For covid, `windowed_seasonal_nssp` is effectively the same model, but with auxiliary data.
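+
+A hedged sketch of the scaling/whitening these models use (the real implementation lives in the repo's epipredict steps; this is illustrative, and `scale_quantile` is an assumed choice):
+
+```{r}
+# Quartic-root transform, then center by the median and scale by an upper
+# quantile of the transformed values, applied per geo.
+whiten <- function(x, scale_quantile = 0.95) {
+  x4 <- x^(1 / 4)
+  (x4 - median(x4, na.rm = TRUE)) / quantile(x4, scale_quantile, na.rm = TRUE)
+}
+```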
+
+### The general ensembles
+
+- `ensemble_windowed` combines the `windowed_seasonal` and `windowed_seasonal_nssp` in a simple half and half ensemble.
+ One would expect this to be more helpful for Flu than Covid, since they have different information available.
+- `retro_submission` is a retroactive recreation of `CMU-TimeSeries` using updated methods (`linear` always matching the most recent value, for example).
+ The weights for the various models can be found in [`flu_geo_exclusions`](https://github.com/cmu-delphi/exploration-tooling/blob/main/flu_geo_exclusions.csv) or [`covid_geo_exclusions`](https://github.com/cmu-delphi/exploration-tooling/blob/main/covid_geo_exclusions.csv).
+ These can vary on a state by state basis.
+- `CMU-TimeSeries` is what we actually submitted.
+ This is a moving target that has changed a number of times. For a detailed list of the weights used, see [`flu_geo_exclusions`](https://github.com/cmu-delphi/exploration-tooling/blob/main/flu_geo_exclusions.csv) or [`covid_geo_exclusions`](https://github.com/cmu-delphi/exploration-tooling/blob/main/covid_geo_exclusions.csv) for specific weights.
+
+
+A timeline of the changes to `CMU-TimeSeries`:
+ ```{r cmu_timeseries_timeline, echo=FALSE}
+ tribble(~Date, ~`Change for flu`, ~`Change for covid`,
+ as.Date("2024-11-21"), "Initial forecast. Uses a simple average of the `climate_base` and the `linear` models.", "Same model as Flu",
+ as.Date("2024-11-27"), "Start using the `climate_linear` model", "start using the `climate_linear` model",
+ as.Date("2024-12-04"), "-", "-",
+ as.Date("2024-12-11"), "Introduction of `windowed_seasonal` model", "model remains just `climate_linear`",
+ as.Date("2024-12-18"), "-", "-",
+ as.Date("2024-12-25"), "-", "-",
+ as.Date("2025-01-01"), "-", "-",
+ as.Date("2025-01-08"), "`linear` no longer has an intercept", "same change",
+ as.Date("2025-01-15"), "`windowed_seasonal_nssp` introduced to ensemble", "-",
+ as.Date("2025-01-22"), "data outage- no forecast", "`windowed_seasonal_nssp` introduced to ensemble, also missing data",
+ as.Date("2025-01-29"), "-", "-",
+ as.Date("2025-02-05"), "-", "`windowed_seasonal` introduced to ensemble",
+ as.Date("2025-02-12"), "-", "-",
+ as.Date("2025-02-19"), "-", "`windowed_seasonal` removed from ensemble",
+ as.Date("2025-02-26"), "-", "`windowed_seasonal` added to ensemble only for states where `nssp` is missing",
+ as.Date("2025-03-05"), "same from here on", "same from here on",
+ as.Date("2025-03-12"), "-", "-",
+ as.Date("2025-03-19"), "-", "-",
+ as.Date("2025-03-26"), "-", "-",
+ as.Date("2025-04-02"), "-", "-",
+ as.Date("2025-04-09"), "-", "-",
+ as.Date("2025-04-16"), "-", "-",
+ as.Date("2025-04-23"), "-", "-",
+ ) %>%
+ datatable(options = list(pageLength=100))
+ ```
+
+
+# Season Scoring
+
+
+In addition to the plots below, it is worth keeping in mind the all-model comparisons from the [flu eval dashboard](https://reichlab.io/flusight-dashboard/eval.html) and [covid](https://reichlab.io/covidhub-dashboard/eval.html).
+We've included the best models there below as well.
+
+For Flu, the best wis-scoring model there is `PSI-PROF` with a mean WIS of 128.6 vs the ensemble's 140.8 and `CMU-TimeSeries`'s 139.7[^1].
+The best MAE-scoring model there is `CEPH-Rtrend_fluH`, with a mean MAE of 187.4 vs the ensemble's 196.6 and `CMU-TimeSeries`'s 197.8.
+Most models are bad at getting 95% coverage, suggesting most teams' extreme quantiles are too narrow.
+Nominal 50% coverage is more common, with about a quarter of forecasters (including us) within a 40-60% range.
+
+For Covid, there are far fewer models submitted overall.
+The best wis-scoring model is actually just the ensemble at 35.2, with the next-best being `UMass-ar6_pooled` at 37.8, compared to `CMU-TimeSeries` at 44.8[^2].
+Coverage in covid is somewhat better, with a larger fraction of teams within +/-10% of 95% coverage; we specifically got within 1%.
+Like with flu, though, there was systematic under-coverage, so the models are again biased towards too-narrow 95% intervals.
+The 50% coverage is likewise more accurate than for flu, with most forecasts within +/-10%.
+`CMU-TimeSeries` is at 52.7%, so slightly over.
+Generally, more teams were under 50% coverage than over, so there is also a systemic bias towards under-coverage in covid.
+
+## Flu Scores
+
+Note that `seasonal_nssp_latest` uses the latest version of the data, to see how much better or worse our forecasts might be if we could get a correct estimate of the revisions.
+
+Before we get into the actual scores, we need to define how we go about creating 3 different phases.
+They are `increasing`, `peak`, and `decreasing`.
+Roughly, `peak` is the interval where the value is within 50% of the max and the other two are before and after.
+For the details, see the "Splitting the season" subsection below.
+
+
+### Splitting the season
+
+Since our forecasters tend to perform very differently depending on the phase of the season, in addition to an overall score, let's split according to phase.
+There's a great deal of ambiguity in defining the phase, however; to keep it simple, let's divide the season into 3 periods:
+
+1. `increasing`: before the peak; normally increasing, but possibly including initial flat periods
+2. `peak`: the interval where the cases are oscillating near or around the peak
+3. `decreasing`: the trailing end of the season after the peak; normally decreasing, but probably including flat periods towards the end
+
+Phase 2 is the most ambiguous of these, since sometimes there is a clean single peak and sometimes there are multiple peaks.
+To keep things simple, let's see what seasons we get if we use "above 50% of the peak value" to define phase 2.
+
+```{r split_season_functions}
+flu_archive <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "nhsn_archive_data"))
+flu_current <- flu_archive %>%
+ epix_as_of_current() %>%
+ filter(geo_value %nin% c("as", "gu", "mp", "vi"))
+flu_max <- flu_current %>% group_by(geo_value) %>% summarize(max_value = max(value))
+compute_peak_season <- function(data_current, threshold = 0.5, start_of_year = as.Date("2024-11-01")) {
+ season_length <- data_current %>% pull(time_value) %>% max() - start_of_year
+ data_current %>%
+ filter(time_value > start_of_year) %>%
+ group_by(geo_value) %>%
+ mutate(max_val = max(value)) %>%
+ filter(value >= threshold * max_val) %>%
+ summarize(first_above = min(time_value), last_above = max(time_value)) %>%
+ mutate(
+ duration = last_above - first_above,
+ rel_duration = as.integer(duration) / as.integer(season_length))
+}
+classify_phase <- function(time_value, first_above, last_above, rel_duration, threshold) {
+ case_when(
+ rel_duration > threshold ~ "flat",
+ time_value < first_above ~ "increasing",
+ time_value > last_above ~ "decreasing",
+ .default = "peak"
+ ) %>% factor(levels = c("increasing", "peak", "decreasing", "flat"))
+}
+covid_flat_threshold <- 0.6
+flu_flat_threshold <- 0.9
+flu_within_max <- compute_peak_season(flu_current)
+sanity_check_classifying <- flu_current %>%
+ left_join(flu_within_max, by = "geo_value") %>%
+ mutate(phase = classify_phase(time_value, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
+ group_by(geo_value) %>%
+ distinct(phase)
+```
+
+
+```{r flu_season_definitions, fig.width = 15, fig.height = 15}
+flu_current %>%
+ filter(time_value > "2024-11-01") %>%
+ autoplot(value, .facet_by = "geo_value") +
+ geom_vline(data = flu_within_max, aes(xintercept = first_above)) +
+ geom_vline(data = flu_within_max, aes(xintercept = last_above)) +
+ facet_wrap(~geo_value, scale = "free") +
+ theme(legend.position = "none")
+```
+
+There is a wide variety of peak lengths under this definition, but it does seem to naturally reflect the differences in dynamics.
+`ok` is quite short, for example, because it has a single clean peak, whereas `or` has 2 peaks of the same height, so the entire interval between them is classified as peak.
+
+Boiling down these plots somewhat, let's look at the averages for the start of the peak and the end of the peak.
+First, for the start:
+
+```{r flu_peak_start}
+flu_within_max %>%
+ filter(rel_duration < flu_flat_threshold) %>%
+ pull(first_above) %>%
+ summary()
+```
+
+So the `increasing` phase ends at the earliest on November 2nd, on average on December 21st, and at the latest on January 4th.
+
+```{r flu_peak_end}
+flu_within_max %>%
+ filter(rel_duration < flu_flat_threshold) %>%
+ pull(last_above) %>%
+ summary()
+```
+
+Similarly, the `peak` phase ends at the earliest on January 18th, on average on February 15th, and at the latest on April 5th.
+
+
+
+### Forecaster Scores for Flu: {.tabset}
+
+Forecast dates: `r forecast_dates`
+
+#### Scores Aggregated By Forecaster
+`geomean` here uses an offset of the smallest non-zero wis score for that forecaster (accounting for floating point zeros).
+Generally there are far too few zeros to have a major effect (something like 2% of the scores); see the toy illustration below.
+The standard deviation for a given forecaster is significantly larger than the actual mean, so we should avoid drawing too many conclusions from these overall scores.
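+
+A toy illustration of the offset (made-up scores, using the `GeoMean` helper defined in setup):
+
+```{r}
+# Without an offset, a single zero would send the geometric mean to zero;
+# offsetting by the smallest non-zero score keeps it informative.
+toy_scores <- c(0, 0.5, 2, 8)
+GeoMean(toy_scores, offset = min(toy_scores[toy_scores > 1e-5]))
+```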
+
+`pop_norm_wis` and `pop_norm_ae` are rates per 100,000 population.
+
+```{r flu_datatable, fig.height = 60, fig.width = 12, echo=FALSE}
+flu_score_summary <- flu_scores %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ group_by(forecaster) %>%
+ mutate(
+ min_wis = min(wis[wis > 1e-5]),
+ min_ae = min(ae_median[ae_median > 1e-5])
+ ) %>%
+ summarize(
+ mean_wis = round(Mean(wis), 2),
+ pop_norm_wis = round(Mean(wis *1e5/pop), 2),
+ geo_wis = round(GeoMean(wis, min_wis), 2),
+ #nWISzero = sum(wis < 1e-5),
+ mean_ae = round(Mean(ae_median), 2),
+ pop_norm_ae = round(Mean(ae_median*1e5/pop), 2),
+ geo_ae = round(GeoMean(ae_median, min_ae), 2),
+ #nAEzero = sum(ae_median < 1e-5),
+ mean_cov_50 = round(Mean(interval_coverage_50), 2),
+ mean_cov_90 = round(Mean(interval_coverage_90), 2),
+ n = n()
+ ) %>%
+ rename(id = forecaster) %>%
+ arrange(mean_wis)
+wis_score_order <- flu_score_summary %>% pull(id)
+pop_score_order <- flu_score_summary %>% arrange(pop_norm_wis) %>% pull(id)
+flu_score_summary %>%
+ datatable()
+```
+
+#### Scores Aggregated By Phase
+Note that the standard deviation is frequently double the actual value, much like in the overall scores.
+Adding it to the plot here results in bands that wash out all variation between forecasters (which you can see by un-commenting the `geom_ribbon` line).
+
+```{r plot_flu_by_phase, fig.height = 8, fig.width = 12, echo=FALSE}
+phase_scores <-
+ flu_scores %>%
+ left_join(flu_within_max, by = "geo_value") %>%
+ mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
+ group_by(forecaster, phase) %>%
+ summarize(
+ wis_sd = sd(wis, na.rm = TRUE),
+ ae_sd = sd(ae_median, na.rm = TRUE),
+ across(c(wis, ae_median, interval_coverage_90), \(x) round(Mean(x), 2)),
+ n = n(),
+ .groups = "drop"
+ )
+p <- ggplot(phase_scores, aes(x = phase, y = wis, color = forecaster, group = forecaster)) +
+ geom_line() +
+ #geom_ribbon(aes(ymin = wis - wis_sd, ymax = wis + wis_sd, fill = forecaster), alpha = 0.3) +
+ geom_point() +
+ scale_y_continuous(breaks = scales::pretty_breaks(n=20), labels = scales::comma) +
+ theme_bw() +
+ labs(x = "Phase", y = "Mean WIS")
+
+ggplotly(p)
+```
+
+#### Scores Aggregated By Forecast Date
+Note that the standard deviation is frequently double the actual value, much like in the overall scores.
+Adding it to the plot here results in bands that wash out all variation between forecasters (which you can see by un-commenting the `geom_ribbon` line).
+
+```{r plot_flu_by_forecast_date, fig.height = 8, fig.width = 12, echo=FALSE}
+agg_flu <- flu_scores %>%
+ filter(forecast_date > "2024-10-01") %>%
+ filter(forecast_date != as.Date("2025-01-25")) %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ group_by(forecaster, forecast_date) %>%
+ summarize(
+ mean_wis = round(Mean(wis), 2),
+ mean_pop_norm_wis = round(Mean(wis *1e5/pop), 2),
+ geomean_wis = round(GeoMean(wis), 2),
+ mean_ae = round(Mean(ae_median), 2),
+ mean_pop_norm_ae = round(Mean(ae_median*1e5/pop), 2),
+ wis_sd = sd(wis),
+ geomean_ae = round(GeoMean(ae_median), 2),
+ mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
+ )
+
+# Plot the scores as lines across forecast_date
+p <- ggplot(agg_flu, aes(x = forecast_date, y = mean_wis, color = forecaster)) +
+ geom_line() +
+ #geom_ribbon(aes(ymin = mean_wis - wis_sd, ymax = mean_wis + wis_sd, fill = forecaster), alpha = 0.1) +
+ theme_bw() +
+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+ labs(x = "Forecast Date", y = "Mean WIS")
+
+ggplotly(p)
+```
+
+#### Scores Aggregated By Ahead
+
+```{r plot_flu_by_ahead, fig.height = 8, fig.width = 12, echo=FALSE}
+agg <- flu_scores %>%
+ group_by(forecaster, ahead) %>%
+ summarize(
+ mean_wis = round(Mean(wis), 2),
+ geomean_wis = round(GeoMean(wis), 2),
+ mean_ae = round(Mean(ae_median), 2),
+ geomean_ae = round(GeoMean(ae_median), 2),
+ mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
+ )
+
+# Plot the scores as lines across forecast_date
+p <- ggplot(agg, aes(x = ahead, y = mean_wis, color = forecaster)) +
+ geom_line() +
+ theme_bw() +
+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+ labs(x = "Ahead", y = "Mean WIS")
+ggplotly(p)
+```
+
+#### Scores Aggregated By State
+These give population normalized WIS for each state and forecaster.
+Since there seems to be a nonlinear effect of population on the target variable, we color by population on a log scale.
+
+`pr` is unsurprisingly quite high for most forecasters.
+
+```{r flu_plot_geo_agged, fig.height = 8, fig.width = 12}
+scored_geo <- flu_scores %>%
+ group_by(forecaster, geo_value) %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ summarize(
+ mean_wis = round(Mean(wis), 2),
+ pop_norm_wis = round(Mean(wis *1e5/pop), 2),
+ geomean_wis = round(GeoMean(wis), 2),
+ mean_ae = round(Mean(ae_median), 2),
+ geomean_ae = round(GeoMean(ae_median), 2),
+ mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
+ ) %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ left_join(flu_max, by = "geo_value") %>%
+ ungroup()
+pop_score_order <- flu_score_summary %>% arrange(pop_norm_wis) %>% pull(id)
+geo_plot <-
+ scored_geo %>%
+ mutate(forecaster = factor(forecaster, levels = pop_score_order)) %>%
+ ggplot(aes(x = geo_value, y = pop_norm_wis, fill = pop)) +
+ geom_col() +
+ facet_wrap(~forecaster) +
+ scale_y_continuous(breaks = scales::pretty_breaks(n=10), labels = scales::comma) +
+ scale_fill_viridis_c(transform="log") +
+ theme_bw() +
+ theme(axis.text.x = element_text(angle = 90, vjust = 0.0, hjust = 0.75))
+
+ggplotly(geo_plot)
+```
+
+#### Score Histograms
+
+The standard deviation is far too large to actually include it in any of the previous graphs and tables.
+It is routinely larger than the mean WIS.
+To try to represent this, in this tab we have the histogram of the WIS, split by phase and forecaster.
+Color below represents population, with darker blue corresponding to low `geo_value` population, and yellow representing high population (this is viridis).
+Even after normalizing by population, there is a large variation in scale for the scores.
+
+The forecasters are arranged according to mean WIS.
+Concentration towards the left corresponds to a better score; for example, `peak` is frequently a flatter distribution, which means most models are doing worse than they were during the `increasing` period.
+During the `peak`, very few forecasters actually have any results in the smallest bin; this implies that basically no forecasters were appreciably correct around the peak.
+
+In the `peak` and `decreasing` phases, the linear model simultaneously has a longer tail and a high degree of concentration otherwise, which implies it is generally right but catastrophically wrong when it misses.
+
+Comparing the `increasing` and `decreasing` phases across forecasters, `decreasing` tends to have a stronger concentration in the lowest two bins, but a much longer tail of large errors.
+
+```{r flu_score_histogram, fig.height = 20, fig.width = 13, echo=FALSE}
+#, levels = exp(seq(log(min(pop)), log(max(pop)), length.out = 10))
+flu_scores %>%
+ left_join(flu_within_max, by = "geo_value") %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ mutate(wis = wis * 1e5/pop) %>%
+ mutate(pop = factor(pop)) %>%
+ mutate(forecaster = factor(forecaster, levels = wis_score_order)) %>%
+ group_by(forecaster) %>%
+ mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
+ ggplot(aes(x = wis, color = pop, y = ifelse(after_stat(count) > 0, after_stat(count), NA))) +
+ geom_histogram(bins = 70) +
+ facet_grid(forecaster~ phase) +
+ labs(title = "Wis score histogram") +
+ ylab("count") +
+ xlab("wis, population normalized") +
+ theme(plot.title = element_text(hjust = 0.5), legend.position = "none") +
+ scale_color_viridis_d()
+```
+
+#### Sample Forecasts
+
+We're plotting the 80% CI along with the median.
+The locations were chosen based on the scores to have a sample of large and small states with large and small (population normalized) WIS.
+We've scaled so everything is in rates per 100k so that it's easier to actually compare; even so the peak value varies drastically.
+Forecasters we've produced are blue, while forecasters from other teams are red.
+They are ordered by `mean_wis` score, best to worst.
+
+```{r}
+tribble(
+ ~state, ~performance, ~population,
+ "ca", "~best", "large",
+ "dc", "~worst", "small",
+ "pa", "terrible", "large",
+ "hi", "~best", "small",
+ "tx", "good", "large"
+) %>% datatable()
+```
+
+```{r flu_plot_sample_forecast, fig.height = 20, fig.width = 13, echo=FALSE}
+plot_geos <- c("ca", "dc", "pa", "hi", "tx")
+filtered_flu_forecasts <- flu_forecasts %>%
+ filter(quantile %in% c(0.1, 0.5, 0.9), geo_value %in% plot_geos)
+
+
+flu_forecast_plt <- filtered_flu_forecasts %>%
+ filter(forecast_date %in% forecast_weeks_to_plot) %>%
+ mutate(forecaster = factor(forecaster, levels = wis_score_order)) %>%
+ mutate(our_forecaster = factor(forecaster %in% our_forecasters, levels = c(TRUE, FALSE))) %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ mutate(value = value * 1e5/ pop) %>%
+ pivot_wider(names_from = quantile, values_from = value) %>%
+ ggplot(aes(x = target_end_date)) +
+ geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`, color = our_forecaster, fill = our_forecaster, group = forecast_date), alpha = 0.5) +
+ geom_line(aes(y = `0.5`, color = our_forecaster, group = forecast_date)) +
+ geom_line(
+ data = flu_current %>%
+ filter(time_value > "2024-11-01", geo_value %in% plot_geos) %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ mutate(value = value * 1e5/ pop),
+ aes(x = time_value, y = value)) +
+ scale_color_brewer(palette = "Set2") +
+ scale_fill_brewer(palette = "Set2") +
+ facet_grid(forecaster ~ geo_value, scale = "free") +
+ theme(legend.position = "none")
+
+ggplotly(flu_forecast_plt)
+```
+
+### Results
+
+Before digging into the score results, take a look at the Score histograms tab; all of these forecasters have a *wide* variation in actual score values, with the standard deviation frequently larger than the mean value.
+This means the mean scores below are going to be pretty sensitive to the large outliers.
+
+#### Overall scores
+
+Our best forecasters cluster around a mean WIS between 124 and 132, with our actually-submitted `CMU-TimeSeries` performing the best.
+The absolute best forecaster this season was `PSI-PROF`, with a `mean_wis` of 113, which is within 10% of `CMU-TimeSeries`.
+While we couldn't have improved our `mean_wis`, we could have improved our population-normalized mean WIS (`pop_norm_wis`) by ~10% by using `windowed_seasonal_nssp`.
+
+
+Absolute error before and after population normalizing has a similar story, though the order for models from other labs changes, and our relative score improves.
+
+Using `mean_cov_50`, our 50% coverage is generally good, with most models within 10 percentage points of 50%.
+There's quite a bit of variability in others' models, with most having too-narrow 50% bands.
+
+Using `mean_cov_90`, most of these models have quantiles that are too narrow, though `CMU-TimeSeries` is within 5 percentage points; `climate_geo_agged` has 90% coverage but is otherwise quite inaccurate.
+`retro_submission` does surprisingly well on this metric, hitting 90% exactly.
+
+
+Note that `seasonal_nssp_latest` is better than the other models by any metric, suggesting that revision information would be quite useful for forecasting, if we could get hold of it.
+It is otherwise identical to `windowed_seasonal_nssp`, and access to the latest data improves it by ~20% and gives it better coverage.
+Looking at the Phase tab, it does especially well during the `increasing` and `peak` phases, with about the same performance as `windowed_seasonal_nssp` in the `decreasing` phase.
+
+#### Phase
+
+Breaking up the scoring by phase, most forecasters cluster together pretty tightly, with only `FluSight-baseline`, `linear`, and both `climate`-only models having appreciably worse scores.
+`climate_linear` is competitive in the `increasing` phase, but once we hit either the `peak` or `decreasing` phases it is less accurate; likely this is because this season was both higher and longer in duration than previous ones.
+All of the models do roughly twice as badly at the peak as during either the `increasing` or `decreasing` phases, with most models doing marginally better during the `decreasing` phase than the `increasing` phase.
+It is worth noting that phase doesn't correspond to just grouping the dates, because different geographies enter a new phase at different times.
+
+#### Ahead
+
+Factoring by ahead, the models that include an AR component generally degrade less badly as the ahead increases.
+Interestingly, the pure `climate` models have mostly consistent (and bad) scores, remaining much more stable as the ahead increases (after the -1 ahead, where they typically have exact data).
+
+#### Sample forecasts
+
+Looking at a couple of forecasts, it primarily looks like our models were off because they predicted the downturn far too early.
+Not as badly as our [pure AR forecasters](decreasing_forecasters.html) did, however.
+The well performing models from other teams also had this behavior this year.
+
+
+
+## Covid Scores
+
+Before we get into the actual scores, we need to define how we go about creating 4 different phases.
+They are `increasing`, `peak`, `decreasing`, and `flat`.
+The last phase, `flat`, covers geos which didn't have an appreciable season for the year, which was relatively common for covid.
+For the details, see the "Splitting the season" subsection below.
+
+
+### Splitting the season
+
+Let's see what kind of phase boundaries we get by reusing the concept from flu for covid.
+That is, the peak phase for a given geo is defined to be the interval when the value is within 50% of the peak.
+
+```{r}
+covid_archive <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "nhsn_archive_data"))
+covid_current <- covid_archive %>%
+ epix_as_of_current() %>%
+ filter(geo_value %nin% c("as", "gu", "mp", "vi"))
+covid_max <- covid_current %>% group_by(geo_value) %>% summarize(max_value = max(value))
+covid_within_max <- compute_peak_season(covid_current)
+```
+
+```{r, fig.width = 15, fig.height = 15}
+covid_current %>%
+ filter(time_value > "2024-11-01") %>%
+ autoplot(value, .facet_by = "geo_value") +
+ geom_vline(data = covid_within_max, aes(xintercept = first_above)) +
+ geom_vline(data = covid_within_max, aes(xintercept = last_above)) +
+ facet_wrap(~geo_value, scale = "free") +
+ ylim(0, NA) +
+ theme(legend.position = "none")
+```
+
+This definition may be a bit more problematic for covid than for flu.
+`dc`, `ga`, `nc`, and `nv` bin ~the entire season into the "peak" phase.
+Primarily this is because only some locations actually had a covid season this year; if we drop the filter to this season and add a vline indicating the season start:
+
+```{r, fig.width = 15, fig.height = 15}
+covid_current %>%
+ filter(time_value > "2022-06-01") %>%
+ autoplot(value, .facet_by = "geo_value") +
+ geom_vline(aes(xintercept = as.Date("2024-11-01"))) +
+ ylim(0, NA) +
+ theme(legend.position = "none")
+```
+
+Then we can see a very muted season in many locations, such as `ar` or `co`, and no season at all in some locations, such as `ak`.
+Others, such as `az`, `in`, or `mn` have a season that is on-par with historical ones.
+
+How to handle this?
+One option is to include a separate phase for no season that applies to the entire `geo_value` if more than half of the `time_value`s are within 50% of the peak:
+
+```{r}
+no_phase <- covid_within_max %>% arrange(desc(rel_duration)) %>% filter(rel_duration > covid_flat_threshold)
+no_phase %>% arrange(rel_duration)
+```
+
+which is 27 out of our 53 geos.
+`r covid_flat_threshold` is admittedly a pretty arbitrary cut-off, chosen so that `geo_value`s with high `rel_duration`s which still appear to have a clear peak, such as `de`, `us`, `wy`, and `mi`, aren't assigned to `flat`.
+We can probably decrease this threshold as we move later into the season and locations which are still decreasing finish dropping.
+As a sanity check, let's plot just these locations to confirm that we're not pulling in geos with actual peaks:
+
+```{r, fig.width = 15, fig.height = 15}
+covid_current %>%
+ filter(time_value > "2022-06-01") %>%
+ filter(geo_value %in% no_phase$geo_value) %>%
+ autoplot(value, .facet_by = "geo_value") +
+ geom_vline(aes(xintercept = as.Date("2024-11-01"))) +
+ ylim(0, NA) +
+ theme(legend.position = "none")
+```
+
+Possible exceptions:
+
+- `pa` unfortunately does seem to have a season, but because it has an early wave, the interval counted as `peak` is too wide. Hopefully as the season actually concludes this will go away.
+- `nc` has a weak peak, but it has only recently declined below the 50% mark. It is likely it will be reclassified after the season is actually over.
+
+There are several locations such as `al` and `ar` which don't have a peak so much as an elevated level for approximately the entire period.
+This is awkward to handle for this classification.
+
+Finally, like for Flu, let's boil these plots down by examining summaries of the start and end dates of the covid peak.
+First, for the start of the peak:
+
+```{r}
+covid_within_max$first_above %>% summary()
+```
+
+Second, for the end of the peak:
+
+```{r}
+covid_within_max$last_above %>% summary()
+```
+
+
+
+### Forecaster Scores for Covid: {.tabset}
+
+Forecast dates: `r forecast_dates`
+
+#### Scores Aggregated By Forecaster
+
+```{r covid_datatable, fig.height = 60, fig.width = 12, echo=FALSE}
+covid_score_summary <- covid_scores %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ group_by(forecaster) %>%
+ mutate(
+ min_wis = min(wis[wis > 1e-5]),
+ min_ae = min(ae_median[ae_median > 1e-5])
+ ) %>%
+ summarize(
+ mean_wis = round(Mean(wis), 2),
+ pop_norm_wis = round(Mean(wis *1e5/pop), 2),
+ geomean_wis = round(GeoMean(wis, min_wis), 2),
+ mean_ae = round(Mean(ae_median), 2),
+ pop_norm_ae = round(Mean(ae_median*1e5/pop), 2),
+ geomean_ae = round(GeoMean(ae_median, min_ae), 2),
+ mean_coverage_50 = round(Mean(interval_coverage_50), 2),
+ mean_coverage_90 = round(Mean(interval_coverage_90), 2),
+ n = n()
+ ) %>%
+ arrange(mean_wis) %>%
+ rename(id = forecaster)
+covid_score_summary %>%
+ datatable()
+```
+
+#### Scores Aggregated By Phase
+
+```{r plot_covid_by_phase, fig.height = 8, fig.width = 12, echo=FALSE}
+phase_scores <- covid_scores %>%
+ left_join(covid_within_max, by = "geo_value") %>%
+ mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
+ group_by(forecaster, phase) %>%
+ summarize(
+ wis_sd = sd(wis, na.rm = TRUE),
+ across(c(wis, ae_median, interval_coverage_90), \(x) round(Mean(x), 2)),
+ n = n(),
+ .groups = "drop"
+ )
+p <- ggplot(phase_scores, aes(x = phase, y = wis, color = forecaster, group = forecaster)) +
+ geom_line() +
+ #geom_ribbon(aes(ymin = wis - wis_sd, ymax = wis + wis_sd, fill = forecaster), alpha = 0.1) +
+ geom_point() +
+ theme_bw() +
+ labs(x = "Phase", y = "Mean WIS")
+
+ggplotly(p)
+```
+
+#### Scores Aggregated By Phase (no flat)
+
+Since the `flat` classification is somewhat ambiguous, we also show the scores with every geo binned into the three standard phases (by setting the flat threshold to 1).
+
+```{r plot_covid_by_phase_no_flat, fig.height = 8, fig.width = 12, echo=FALSE}
+phase_scores <- covid_scores %>%
+ left_join(covid_within_max, by = "geo_value") %>%
+ mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, 1)) %>%
+ group_by(forecaster, phase) %>%
+ summarize(
+ wis_sd = sd(wis, na.rm = TRUE),
+ across(c(wis, ae_median, interval_coverage_90), \(x) round(Mean(x), 2)),
+ n = n(),
+ .groups = "drop"
+ )
+p <- ggplot(phase_scores, aes(x = phase, y = wis, color = forecaster, group = forecaster)) +
+ geom_line() +
+ #geom_ribbon(aes(ymin = wis - wis_sd, ymax = wis + wis_sd, fill = forecaster), alpha = 0.1) +
+ geom_point() +
+ theme_bw() +
+ labs(x = "Phase", y = "Mean WIS")
+
+ggplotly(p)
+```
+
+#### Scores Aggregated By Forecast Date
+
+```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+agg_covid <- covid_scores %>%
+ filter(forecast_date > "2024-10-01") %>%
+ filter(forecast_date != as.Date("2025-01-25")) %>%
+ group_by(forecaster, forecast_date) %>%
+ summarize(
+ mean_wis = round(Mean(wis), 2),
+ wis_sd = round(sd(wis), 2),
+ geomean_wis = round(GeoMean(wis), 2),
+ mean_ae = round(Mean(ae_median), 2),
+ geomean_ae = round(GeoMean(ae_median), 2),
+ mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
+ )
+
+# Plot the scores as lines across forecast_date
+p <- ggplot(agg_covid, aes(x = forecast_date, y = mean_wis, color = forecaster)) +
+ geom_line() +
+ #geom_ribbon(aes(ymin = mean_wis - wis_sd, ymax = mean_wis + wis_sd, fill = forecaster), alpha = 0.1) +
+ theme_bw() +
+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+ labs(x = "Forecast Date", y = "Mean WIS")
+
+ggplotly(p)
+```
+
+#### Scores Aggregated By Ahead
+
+```{r, fig.height = 8, fig.width = 12, echo=FALSE}
+agg <- covid_scores %>%
+ group_by(forecaster, ahead) %>%
+ summarize(
+ mean_wis = round(Mean(wis), 2),
+ wis_sd = round(sd(wis), 2),
+ geomean_wis = round(GeoMean(wis), 2),
+ mean_ae = round(Mean(ae_median), 2),
+ geomean_ae = round(GeoMean(ae_median), 2),
+ mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
+ )
+
+# Plot the scores as lines across forecast_date
+p <- ggplot(agg, aes(x = ahead, y = mean_wis, color = forecaster)) +
+ geom_line() +
+ #geom_ribbon(aes(ymin = mean_wis - wis_sd, ymax = mean_wis + wis_sd, fill = forecaster), alpha = 0.1) +
+ theme_bw() +
+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+ labs(x = "Ahead", y = "Mean WIS")
+ggplotly(p)
+```
+
+
+#### Scores Aggregated By State
+
+These give population normalized WIS for each state and forecaster.
+Since there seems to be a nonlinear effect of population on the target variable, we color by population on a log scale.
+They are ordered by their average population normalized WIS.
+We have a separate plot for `climate_geo_agged` because it does so poorly on the small states that it washes out our ability to compare across states and forecasters (note that its maximum is an order of magnitude higher: 26 vs 2.4).
+
+If you want to see a non-population scaled version of this, switch `y = pop_norm_wis` to `y = mean_wis` below, and comment out the `climate_geo_agged` filter.
+
+```{r covid_plot_geo_agged, fig.height = 8, fig.width = 12}
+pop_wis_order <- covid_score_summary %>% arrange(pop_norm_wis) %>% pull(id)
+score_geo <- covid_scores %>%
+ group_by(forecaster, geo_value) %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ summarize(
+ mean_wis = round(Mean(wis), 2),
+ pop_norm_wis = round(Mean(wis *1e5/pop), 2),
+ geomean_wis = round(GeoMean(wis), 2),
+ mean_ae = round(Mean(ae_median), 2),
+ geomean_ae = round(GeoMean(ae_median), 2),
+ mean_interval_coverage_90 = round(Mean(interval_coverage_90), 2),
+ ) %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ ungroup() %>%
+ mutate(forecaster = factor(forecaster, levels = pop_wis_order))
+
+geo_plot <- score_geo %>%
+ filter(forecaster != "climate_geo_agged") %>%
+ #mutate(mean_wis = pmin(mean_wis, y_limit)) %>%
+ ggplot(aes(x = geo_value, y = pop_norm_wis, fill = pop)) +
+ geom_col() +
+ facet_wrap(~forecaster) +
+ scale_y_continuous(breaks = scales::pretty_breaks(n=10), labels = scales::comma) +
+ scale_fill_viridis_c(transform="log") +
+ theme(axis.text.x = element_text(angle = 90, vjust = 0.0, hjust = 0.75))
+
+ggplotly(geo_plot)
+```
+
+```{r}
+score_geo %>%
+ filter(forecaster == "climate_geo_agged") %>%
+ ggplot(aes(x = geo_value, y = pop_norm_wis, fill = pop)) +
+ geom_col() +
+ facet_wrap(~forecaster) +
+ scale_y_continuous(breaks = scales::pretty_breaks(n=10), labels = scales::comma) +
+ scale_fill_viridis_c(breaks = scales::breaks_log(n=4), labels = scales::label_log(), transform="log") +
+ theme(axis.text.x = element_text(angle = 90, vjust = 0.0, hjust = 0.75))
+```
+
+#### Score Histograms
+
+The standard deviation is far too large to include meaningfully in any of the previous graphs and tables.
+It is routinely larger than the WIS value itself.
+Like with Flu, in this tab we have the histogram of the wis, split by phase and forecaster.
+Color below represents population, with darker blue corresponding to low `geo_value` population, and yellow representing high population (this is viridis).
+Even after normalizing by population, there is a variation in scale for the scores.
+
+The forecasters are ordered according to mean WIS.
+Concentration towards the left corresponds to a better score; for example, `peak` is frequently a flatter distribution, which means most models are doing worse than they were during the `increasing` period.
+
+Like in Flu, in the `peak` phase nearly all forecasters are missing the first bin, so no forecasters are right during the peak.
+Unlike in Flu, the `flat` phase exists, and roughly resembles `decreasing` in distribution.
+`increasing` is overall a much smaller proportion of all samples.
+
+`climate_base` has the closest to a normal score distribution of any of these.
+`climate_geo_agged` is particularly bad for Covid.
+
+```{r covid_plot_score_histogram, fig.height = 23, fig.width = 13}
+pop_score_order <- covid_score_summary %>% arrange(pop_norm_wis) %>% pull(id)
+wis_score_order <- covid_score_summary %>% arrange(mean_wis) %>% pull(id)
+covid_scores %>%
+ left_join(covid_within_max, by = "geo_value") %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ mutate(wis = wis * 1e5/pop) %>%
+ mutate(pop = factor(pop)) %>%
+ mutate(forecaster = factor(forecaster, levels = pop_score_order)) %>%
+ group_by(forecaster) %>%
+ mutate(phase = classify_phase(target_end_date, first_above, last_above, rel_duration, covid_flat_threshold)) %>%
+ ggplot(aes(x = wis, color = pop, y = ifelse(after_stat(count) > 0, after_stat(count), NA))) +
+ geom_histogram(bins = 120) +
+ facet_grid(forecaster~ phase) +
+ labs(title = "Wis score histogram") +
+ ylab("count") +
+ xlab("wis, population normalized") +
+ theme(plot.title = element_text(hjust = 0.5), legend.position = "none") +
+ scale_color_viridis_d()
+```
+
+#### Sample Forecasts
+
+We're plotting the 80% CI along with the median.
+The locations were chosen based on the scores to have a sample of large and small states with large and small (population normalized) WIS.
+We've scaled so everything is in rates per 100k so that it's easier to actually compare; even so the peak value varies drastically.
+Forecasters we've produced are blue, while forecasters from other teams are red.
+They are ordered by `mean_wis` score, best to worst.
+
+```{r covid_plot_sample_forecast, fig.height = 23, fig.width = 13, echo=FALSE}
+plot_geos <- c("ca", "de", "pa", "nh", "tx")
+filtered_covid_forecasts <- covid_forecasts %>%
+ ungroup() %>%
+ filter(quantile %in% c(0.1, 0.5, 0.9), geo_value %in% plot_geos)
+
+covid_forecast_plt <- filtered_covid_forecasts %>%
+ filter(forecast_date %in% forecast_weeks_to_plot) %>%
+ mutate(forecaster = factor(forecaster, levels = wis_score_order)) %>%
+ mutate(our_forecaster = factor(forecaster %in% our_forecasters, levels = c(TRUE, FALSE))) %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ mutate(value = value * 1e5/ pop) %>%
+ pivot_wider(names_from = quantile, values_from = value) %>%
+ ggplot(aes(x = target_end_date)) +
+ geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`, color = our_forecaster, fill = our_forecaster, group = forecast_date), alpha = 0.5) +
+ geom_line(aes(y = `0.5`, color = our_forecaster, group = forecast_date)) +
+ geom_line(
+ data = covid_current %>%
+ filter(time_value > "2024-11-01", geo_value %in% plot_geos) %>%
+ left_join(state_census, by = join_by(geo_value == abbr)) %>%
+ mutate(value = value * 1e5/ pop),
+ aes(x = time_value, y = value)) +
+ scale_color_brewer(palette = "Set2") +
+ scale_fill_brewer(palette = "Set2") +
+ facet_grid(forecaster ~ geo_value, scale = "free") +
+ theme(legend.position = "none")
+
+ggplotly(covid_forecast_plt)
+```
+
+### Results
+
+One peculiar thing about Covid scoring: on the first forecast date, CMU-TimeSeries has *much* worse scores than almost any of the subsequent days (you can see this in the Scores Aggregated By Forecast Date tab above).
+There are two related issues here:
+
+- first, our initial model simply averaged `climate_base` and `linear`, and the `climate_base` component was unusually bad early in the season, because this season started later than previous seasons,
+- second, the data had substantial revisions (this is discussed in detail in [this notebook](first_day_wrong.html)), however this effect is much smaller, since other forecasters had access to the same data.
+
+This mishap dragged the `CMU-TimeSeries` score down overall by quite a lot, and its better performance later in the season is not enough to make up for it.
+
+Overall, the best covid forecaster is `windowed_seasonal_nssp`, outperforming `CovidHub-ensemble`, regardless of the metric used.
+This forecaster uses a window of data around the given time period, along with the NSSP exogenous features.
+`ensemble_windowed` is nearly as good, but since it effectively averages `windowed_seasonal_nssp` with `windowed_seasonal` and loses accuracy as a result, it is hardly worth it.
+Given its simplicity, the `climate_linear` forecaster does quite well, though it's not as good as `windowed_seasonal_nssp`.
+
+The pure climate models were substantially worse for covid than for flu, at ~4.6x the best model, rather than ~2x.
+Given the unusual nature of the season, this is somewhat unsurprising.
+
+To some degree this explains the poor performance of `CMU-TimeSeries`.
+You can see this by looking at the "Scores Aggregated By Forecast Date" tab, where the first 3 weeks of `CMU-TimeSeries` are significantly worse than `climate_linear`, let alone the ensemble or our best models.
+
+`seasonal_nssp_latest`, which has access to the latest data, doesn't have a significantly different score from `windowed_seasonal_nssp`, which it is otherwise identical to.
+
+#### Aggregated by phase
+
+There are two tabs dedicated to this, one with and one without a separate `flat` phase, which labels an entire state as `flat` if the duration of the `peak` is too long.
+Either way, the general shape is similar to Flu, with `increasing` scores lower than `peak` scores, but higher than `decreasing` scores.
+All of the phases are closer together than they were in the case of Flu, with the best `peak`-phase forecaster scoring nearly as well as the worst `increasing`-phase forecaster.
+`flat` roughly resembles `increasing`.
+Even disregarding the climate models, the distribution within a phase is wider than it was in the case of Flu.
+`windowed_seasonal_nssp` particularly shines during the `peak` and to some degree the `decreasing` phases.
+
+#### Aggregated by ahead
+
+Nothing terribly surprising here; most models' scores grow ~linearly with the ahead.
+`windowed_seasonal_nssp` is the exception, doing comparatively worse at further aheads.
+
+#### Aggregated by State
+
+Across all forecasters, `wy` is a particularly difficult location to forecast, while `ca` is particularly easy.
+Scores don't seem to correlate particularly well with the population of the state.
+The variation in state scores for other groups' forecasters is fairly similar to our non-climate forecasters.
+Both climate forecasters have a different distribution of which states are correct and which are wrong, and differ greatly from each other.
+
+#### Sample Forecasts
+
+The always decreasing problem is definitely not present in these forecasts.
+If anything, our best forecasts are *too* eager to predict an increasing value, e.g. in `tx` and `ca`.
+Several of our worse forecasts are clearly caused by revision behavior.
+
+
+# Revision behavior and data substitution
+
+This is covered in more detail in [revision_summary_report_2025](revision_summary_2025.html).
+NHSN has substantial under-reporting behavior that is fairly consistent for any single geo, though there are a number of aberrant revisions, some of which change the entire trajectory for a couple of weeks.
+This is even more true for NSSP than NHSN, though NSSP's revisions are much smaller and occur more quickly.
+Because the revisions settle so quickly, they matter only for prediction, rather than for correcting the data used to fit the forecaster.
+We can probably improve our forecasts by incorporating revision behavior for both nhsn and nssp.
+
+Further, flu and covid revision behavior is fairly strongly correlated; it is reported through the same channels by the same people, so this makes sense.
+We should look into the extra columns to see if they provide useful information for handling revision behavior.
+
+### Revision examples
+
+```{r}
+nhsn_archive_flu <- qs2::qs_read(here::here("flu_hosp_prod", "objects", "nhsn_archive_data"))
+
+nhsn_archive_flu <- nhsn_archive_flu$DT %>% filter(time_value >= "2024-11-19", geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
+nhsn_archive_flu$time_type <- "day"
+revision_summary <- nhsn_archive_flu %>% epiprocess::revision_analysis(value, min_waiting_period = NULL)
+av_re_spread <- revision_summary$revision_behavior %>%
+ group_by(geo_value) %>%
+ summarize(rel_spread = mean(rel_spread, na.rm = TRUE)) %>%
+ arrange(desc(rel_spread)) %>%
+ filter(geo_value %nin% c("vi", "as", "gu"))
+nhsn_archive_flu %>% autoplot(value, .facet_filter = geo_value %in% av_re_spread$geo_value[1:9]) +
+ theme(strip.text.x = element_text(size = 8), plot.title = element_text(hjust = 0.5)) +
+ labs(title = "Flu revisions for the highest mean relative spread")
+```
+
+```{r}
+nhsn_archive_covid <- qs2::qs_read(here::here("covid_hosp_prod", "objects", "nhsn_archive_data"))
+
+nhsn_archive_covid <- nhsn_archive_covid$DT %>% filter(time_value >= "2024-11-19", geo_value %nin% c("vi", "as", "gu")) %>% as_epi_archive()
+nhsn_archive_covid$time_type <- "day"
+revision_summary <- nhsn_archive_covid %>% epiprocess::revision_analysis(value, min_waiting_period = NULL)
+av_re_spread <- revision_summary$revision_behavior %>%
+ group_by(geo_value) %>%
+ summarize(rel_spread = mean(rel_spread, na.rm = TRUE)) %>%
+ arrange(desc(rel_spread)) %>%
+ filter(geo_value %nin% c("vi", "as", "gu"))
+nhsn_archive_covid %>% autoplot(value, .facet_filter = geo_value %in% av_re_spread$geo_value[1:9]) +
+ theme(strip.text.x = element_text(size = 8), plot.title = element_text(hjust = 0.5)) +
+ labs(title = "Covid revisions for the highest mean relative spread")
+```
+
+### Data substitution
+
+In short, this didn't go well.
+It was a coin toss for covid, and worse than not doing corrections for flu.
+
+
+[^1]: this is off from our local version of the scoring by .6, which is nonzero but not appreciably different.
+ It's scored on N=4160 vs the local 3692, which probably comes down to negative aheads.
+ Note that both "bests" in this paragraph are ignoring models which have far fewer submission values, since they're likely to be unrepresentative.
+
+[^2]: this is further off both in absolute and further yet in relative terms from our local scoring, which has `CMU-TimeSeries` at 46.32 rather than 44.8.
+  It's unclear why; there are 3952 samples scored on the remote vs 3692 locally, so ~300 forecasts are scored there that we don't score locally, on which we apparently did better.
+
+# Further methods to explore
+
+The [decreasing forecasters notebook](decreasing_forecasters.html) has a number of suggestions, though that is a problem that occurs most frequently with Flu data rather than Covid.
+The broad categories there are
+
+1. Filtering to the relevant phase; `windowed_seasonal_*` is roughly an example of this, which is likely why it outperformed a simple AR.
+2. Better use of non-linear models. This would allow us to capture increasing, decreasing, and flat trends in the same model.
+3. Explicitly using the growth rate as a covariate, or fitting on differences rather than the raw value (see the sketch after this list).
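+
+A minimal, illustrative sketch of option 3 (not the repo's actual implementation): adding a simple relative-change growth rate as an extra column, using the `flu_current` snapshot from above.
+
+```{r}
+# Per geo, compute week-over-week relative change as a candidate covariate.
+flu_current %>%
+  group_by(geo_value) %>%
+  arrange(time_value, .by_group = TRUE) %>%
+  mutate(growth_rate = (value - lag(value)) / lag(value)) %>%
+  ungroup() %>%
+  select(geo_value, time_value, value, growth_rate) %>%
+  head()
+```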
+
+A perhaps unexpected direction that came out of that notebook is some sort of non-linear per-state scaling; we found that scaling the counts by population^2 (or scaling rates by population) eliminated the decreasing forecasts problem.
+Hopefully the Yeo-Johnson step we have been working on will be able to choose an appropriate scaling factor to more systematically recreate this.
+
+Given the high utility of NSSP this season, the most important thing we can do is look for useful leading exogenous variables.
+In addition to signals in epidata and processed versions of those, NHSN releases a large number of accompanying variables which we should consider more explicitly.
+
+In addition to that, incorporating revision behavior would likely yield some results.
+The performance of `seasonal_nssp_latest` on Flu supports this, though its performance on Covid was surprisingly lackluster.