diff --git a/common/covidcast.bib b/common/covidcast.bib index 804224a..8145519 100644 --- a/common/covidcast.bib +++ b/common/covidcast.bib @@ -567,7 +567,7 @@ @misc{EvalcastR @book{Hyndman:2018, author = {Hyndman, Rob J and Athanasopoulos, George}, publisher = {OTexts}, - title = {Forecasting: {P}rinciples and Practice}, + title = {Forecasting: Principles and Practice}, year = {2018}} @misc{ForecastHub, @@ -970,4 +970,236 @@ @article{Bilinski:2021 number = {6}, pages = {1825--1828}, Bdsk-Url-1 = {https://doi.org/10.1007/s11606-021-06633-8}, +} + + +%% Pile of Forecast Refs + +%% Real Time Data + + +@article{faust2009a, + author = {Faust, Jon and Wright, Jonathan H.}, + journal = {Journal of Business and Economic Statistics}, + pages = {468---479}, + title = {Comparing Greenbook and Reduced Form Forecasts using a Large Realtime Dataset}, + volume = {27}, + year = {2009}} + +@article{diebold1991a, + author = {Diebold, Francis X. and Rudebusch, Glenn D.}, + journal = {Journal of the American Statistical Association}, + pages = {603---610}, + title = {Forecasting Output With the Composite Leading Index: A Real-Time Analysis}, + volume = {86}, + year = {1991}} + +@article{patterson1995a, + author = {Patterson, K.D.}, + journal = {International Journal of Forecasting}, + pages = {395---405}, + title = {Forecasting the Final Vintage of Real Personal Disposable Income: A State Space Approach}, + volume = {11}, + year = {1995}} + +@incollection{croushore2011a, + author = {Croushore, Dean}, + booktitle = {The Oxford Handbook of Economic Forecasting}, + editor = {Hendry, David F. and Clements, Michael P.}, + pages = {247---267}, + publisher = {Oxford University Press}, + title = {Forecasting with real-time data vintages}, + year = {2011}} + +@incollection{croushore2006a, + address = {Amsterdam}, + author = {Croushore, Dean}, + booktitle = {Handbook of Economic Forecasting}, + editor = {Elliott, Graham and Granger, Clive W.J. and Timmermann, Allan}, + pages = {961--972}, + publisher = {North-Holland}, + title = {Forecasting with Real-Time Macroeconomic Data}, + year = {2006}} + +@incollection{harvey1983a, + address = {Washington, D.C}, + author = {Harvey, A.C. and McKenzie, C.R. and Blake, D.P.C. and Desai, M.J.}, + booktitle = {Applied Time Series Analysis of Economic Data}, + editor = {Zellner, Arnold}, + pages = {329---347}, + publisher = {U.S. Department of Commerce}, + title = {Irregular Data Revisions}, + type = {Economic Research Report ER-5,}, + year = {1983}} + +@article{mankiw1986a, + author = {Mankiw, N.Gregory and Shapiro, Matthew D.}, + journal = {Survey of Current Business}, + pages = {20--25}, + volume = {66}, + title = {News or Noise: An Analysis of GNP Revisions}, + year = {1986}} + +@article{trivellato1986a, + author = {Trivellato, Ugo and Rettore, Enrice}, + journal = {Journal of Business and Economic Statistics}, + pages = {445---453}, + title = {Preliminary Data Errors and Their Impact on the Forecast Error of Simultaneous-Equations Models}, + volume = {4}, + year = {1986}} + +@article{clark2009a, + author = {Clark, Todd E. 
and McCracken, Michael W.}, + journal = {Journal of Business and Economic Statistics}, + pages = {441---454}, + title = {Tests of Equal Predictive Ability with Real-Time Data}, + volume = {27}, + year = {2009}} + +@article{koenig2003a, + author = {Koenig, Evan and Dolmas, Sheila and Piger, Jeremy}, + journal = {Review of Economics and Statistics}, + pages = {618---628}, + title = {The Use and Abuse of `Real-Time' Data in Economic Forecasting}, + volume = {85}, + year = {2003}} + +@article{howrey1978a, + author = {Howrey, E.Philip}, + journal = {Review of Economics and Statistics}, + pages = {193---200}, + title = {The Use of Preliminary Data in Econometric Forecasting}, + volume = {60}, + year = {1978}} + +% Leadingness + +@article{yeats1972a, + author = {Yeats, A.J.}, + journal = {Business Economics}, + pages = {7---11}, + title = {An Alternative Approach to the Identification of Leading Indicators}, + volume = {7}, + year = {1972}} + +@article{emerson1996a, + author = {Emerson, R.A. and Hendry, D.F.}, + journal = {Journal of Forecasting}, + pages = {271---291}, + title = {An evaluation of forecasting using leading indicators}, + volume = {15}, + year = {1996}} + +@article{koch1988a, + author = {Koch, P. and Rasche, R.H.}, + journal = {Journal of Business and Economic Statistics}, + pages = {167---187}, + title = {An examination of the commerce department leading-indicator approach'}, + volume = {6}, + year = {1988}} + +@incollection{sargent1977a, + author = {Sargent, Thomas and Sims, Christopher}, + booktitle = {New Methods in Business Cycle Research}, + pages = {45--109}, + publisher = {Federal Reserve Bank of Minneapolis}, + title = {Business cycle modeling without pretending to have too much a priori economic theory}, + year = {1977}} + +@article{engle1987a, + author = {Engle, Robert F. and Granger, Clive W.J.}, + journal = {Econometrica}, + pages = {251--276}, + volume = {55}, + title = {Co-integration and error correction: Representation, estimation, and testing}, + year = {1987}} + +@incollection{shiskin1968a, + address = {New York}, + author = {Shiskin, J. and Moore, G.}, + booktitle = {Supplement to National Bureau Report One}, + pages = {1--8}, + publisher = {NBER}, + title = {Composite Indexes of Leading, Coinciding and Lagging Indicators, 1948--1967}, + year = {1968}} + +@article{granger1969a, + author = {Granger, Clive W.J.}, + journal = {Econometrica}, + pages = {424---438}, + title = {Investigating causal relations by econometric models and cross-spectral methods}, + volume = {37}, + year = {1969}} + +@book{lahiri1991a, + author = {Lahiri, K. and Moore, G.H.}, + publisher = {Cambridge University Press}, + title = {Leading economic indicators: New approaches and forecasting records}, + year = {1992}} + +@article{koopmans1947a, + author = {Koopmans, T.C.}, + journal = {Review of Economics and Statistics}, + pages = {161---179}, + title = {Measurement without theory}, + volume = {29}, + year = {1947}} + +@article{StockWatson1989, + author = {Stock, James H. and Watson, Mark W.}, + journal = {NBER Macroeconomics Annual}, + pages = {351---394}, + title = {New Indexes of Coincident and Leading Economic Indicators}, + volume = {4}, + year = {1989}} + +@article{diebold1989a, + author = {Diebold, F.X. and Rudebusch, G.D.}, + journal = {Journal of Business}, + pages = {369---391}, + title = {Scoring the leading indicators'}, + volume = {62}, + year = {1989}} + +@book{mitchell1938a, + address = {New York}, + author = {Mitchell, W. 
and Burns, A.F.}, + publisher = {NBER Bulletin 69}, + title = {Statistical Indicators of Cyclical Revivals}, + year = {1938}} + +@article{auerbach1982a, + author = {Auerbach, A.J.}, + journal = {Review of Economics and Statistics}, + pages = {589---595}, + title = {The index of leading indicators: Measurement without theory thirty-five years later'}, + volume = {64}, + year = {1982}} + +@article{hamilton1996a, + author = {Hamilton, James D. and Perez-Quiros, Gabriel}, + journal = {The Journal of Business}, + pages = {27---49}, + title = {What Do the Leading Indicators Lead?}, + volume = {69}, + year = {1996}} + +@article{ioannidis2020a, + author = {Ioannidis, J.P.A. and Cripps, S. and Tanner, M.A.}, + title = {Forecasting for COVID-19 has failed}, + year = {2020}, + volume = {10}, + doi = {10.1016/j.ijforecast.2020.08.004}, + journal = {International Journal of Forecasting} +} + +@article{chakraborty2018know, + title={What to know before forecasting the flu}, + author={Chakraborty, Prithwish and Lewis, Bryan and Eubank, Stephen and Brownstein, John S and Marathe, Madhav and Ramakrishnan, Naren}, + journal={PLoS computational biology}, + volume={14}, + number={10}, + pages={e1005964}, + year={2018}, + publisher={Public Library of Science San Francisco, CA USA} } \ No newline at end of file diff --git a/forecast/code/deprecated/compare-baselines.R b/forecast/code/deprecated/compare-baselines.R new file mode 100644 index 0000000..7714f1d --- /dev/null +++ b/forecast/code/deprecated/compare-baselines.R @@ -0,0 +1,78 @@ +library(evalcast) +library(covidcast) +library(tidyverse) +library(aws.s3) +Sys.setenv("AWS_DEFAULT_REGION" = "us-east-2") +s3bucket <- get_bucket("forecast-eval") + + +# Point to the data Addison sent +ours <- readRDS(here::here("data", "results_honest_states.RDS")) +baseline <- ours %>% filter(forecaster == "Baseline") %>% + select(-forecaster) %>% rename(strawman_wis = wis) + +# these are epiweek / num +bpreds <- s3readRDS("predictions_cards.rds", s3bucket) %>% + filter(forecaster == "COVIDhub-baseline", + signal == "confirmed_incidence_num", + forecast_date < "2021-01-01", + ahead %in% 1:4, + !(geo_value %in% c("as", "gu", "pr", "vi", "mp", "us"))) + +# trying with the prop signal, since that's what we use +actuals <- covidcast_signal("jhu-csse", "confirmed_7dav_incidence_prop", + end_day = "2021-02-01", geo_type = "state", + as_of = "2021-05-18") %>% + select(geo_value, time_value, value) %>% + rename(target_end_date = time_value, actual = value) + +# Population / 100,000 +pop <- covidcast::state_census %>% select(ABBR, POPESTIMATE2019) %>% + mutate(geo_value = tolower(ABBR), POPESTIMATE2019 = POPESTIMATE2019 / 1e5) %>% + select(-ABBR) + +# Scale epiweek total to daily prop +bpreds2 <- left_join(bpreds, pop) %>% + mutate(value = value / (7 * POPESTIMATE2019)) %>% + select(-POPESTIMATE2019, -incidence_period) + +# Score the Hub Baseline +bscores <- evaluate_predictions(bpreds2, actuals, + grp_vars = c("ahead", "forecast_date", "geo_value"), + err_measures = list(wis = weighted_interval_score)) + +comb <- left_join(bscores %>% + # submissions made on Monday with Sunday target + mutate(ahead = ahead * 7 - 2, + forecast_date = target_end_date - ahead) %>% + select(forecast_date, target_end_date, geo_value, wis), + baseline %>% + select(forecast_date, target_end_date, geo_value, strawman_wis), + by = c("geo_value", "forecast_date", "target_end_date")) %>% + filter(!is.na(strawman_wis), !is.na(wis)) + + + + +ggplot(comb, aes(wis, strawman_wis, color = geo_value)) + + 
geom_point() + + theme_bw() + + theme(legend.position = "none") + + geom_abline(intercept =0, slope = 1) + + scale_x_log10() + + scale_y_log10() + + xlab("WIS of COVIDhub-baseline") + + ylab("WIS or our baseline") + + coord_equal() + +ncomb <- comb %>% + mutate(ahead = target_end_date - forecast_date) %>% + pivot_longer(contains("wis"), names_to = "forecaster") %>% + group_by(forecaster, ahead) %>% + summarise(wis = Mean(value), geo_wis = GeoMean(value)) %>% + mutate(forecaster = recode(forecaster, wis = "Hub", strawman_wis = "Ours")) %>% + pivot_longer(contains("wis")) +ggplot(ncomb, aes(as.character(ahead), value, fill = forecaster)) + + geom_col(position = "dodge") + + facet_wrap(~name) + \ No newline at end of file diff --git a/forecast/code/deprecated/compare-to-hub.R b/forecast/code/deprecated/compare-to-hub.R deleted file mode 100644 index 8c052b6..0000000 --- a/forecast/code/deprecated/compare-to-hub.R +++ /dev/null @@ -1,95 +0,0 @@ -library(aws.s3) -Sys.setenv("AWS_DEFAULT_REGION" = "us-east-2") -s3bucket <- get_bucket("forecast-eval") -n_keeper_weeks <- 6L -n_keeper_locs <- 50L -case_scores <- s3readRDS("score_cards_state_cases.rds", s3bucket) -case_scores <- case_scores %>% - mutate(forecast_date = target_end_date - ahead * 7) %>% - # fix weirdnesses about submission dates - filter(forecast_date < "2021-01-01") %>% - select(ahead, geo_value, forecaster, target_end_date, wis, forecast_date) - -strawman <- case_scores %>% filter(forecaster == "COVIDhub-baseline") -case_scores <- left_join( - case_scores, - strawman %>% - select(forecast_date, target_end_date, geo_value, wis) %>% - rename(strawman_wis = wis) - ) -case_scores <- case_scores %>% - filter(forecaster != "COVIDhub-baseline") -n_submitted <- case_scores %>% - filter(ahead == 2) %>% - group_by(forecaster) %>% - summarise(nfcasts = n()) -keepers <- n_submitted %>% - filter(nfcasts / n_keeper_locs > n_keeper_weeks - .0001) %>% - # submitted at least z weeks for x locations - pull(forecaster) - -case_scores <- case_scores %>% filter(forecaster %in% keepers) -all_time_performance <- case_scores %>% - group_by(forecaster, ahead) %>% - summarise(rel_wis = Mean(wis) / Mean(strawman_wis), - geo_wis1 = GeoMean((wis + 1) / (strawman_wis + 1)), - geo_wis = GeoMean(wis / strawman_wis)) - -all_time_performance %>% - pivot_longer(contains("wis")) %>% - ggplot(aes(ahead, value, color = forecaster)) + - theme_bw() + - geom_point() + - geom_line() + - scale_color_viridis_d() + - facet_wrap(~name) + - geom_hline(yintercept = 1, color = "red") + - theme(legend.position = "bottom", legend.title = element_blank()) + - scale_y_log10() - -df <- summarizer(fcasts_honest %>% filter(period != "jm") %>% - group_by(ahead, forecaster), "wis", NULL, - "strawman_wis", Mean, c("aggr","scale")) - -GeoMean1 <- function(x) exp(mean(log(x + 1), na.rm = TRUE)) - -all_time_performance <- all_time_performance %>% - filter(forecaster != "COVIDhub-4_week_ensemble") - -ggplot(df) + - geom_line(aes(ahead, wis, color = forecaster)) + - geom_point(aes(ahead, wis, color = forecaster)) + - theme_bw() + - geom_hline(yintercept = 1, size = 1.5) + - xlab("Days ahead") + - ylab("Mean WIS (relative to baseline)") + - scale_color_manual(values = fcast_colors, guide = guide_legend(nrow = 1)) + - theme(legend.position = "bottom", legend.title = element_blank()) + - geom_line(data = all_time_performance %>% filter(ahead < 4, forecaster != "COVIDhub-ensemble"), - aes(ahead * 7, rel_wis, group = forecaster), - color = "grey70") + - geom_line(data = all_time_performance %>% 
filter(ahead < 4, forecaster == "COVIDhub-ensemble"), - aes(ahead * 7, rel_wis), color = "lightblue", size = 1.5) + - scale_y_log10() - -df2 <- fcasts_honest %>% - filter(period != "jm") %>% - group_by(ahead, forecaster) %>% - summarise(wis = GeoMean((wis + 1) / (strawman_wis + 1))) - - -ggplot(df2) + - geom_line(aes(ahead, wis, color = forecaster)) + - geom_point(aes(ahead, wis, color = forecaster)) + - theme_bw() + - geom_hline(yintercept = 1, size = 1.5) + - xlab("Days ahead") + - ylab("Geometric mean of WIS (relative to baseline)") + - scale_color_manual(values = fcast_colors, guide = guide_legend(nrow = 1)) + - theme(legend.position = "bottom", legend.title = element_blank()) + - geom_line(data = all_time_performance %>% filter(ahead < 4, forecaster != "COVIDhub-ensemble"), - aes(ahead * 7, geo_wis1, group = forecaster), - color = "grey70") + - geom_line(data = all_time_performance %>% filter(ahead < 4, forecaster == "COVIDhub-ensemble"), - aes(ahead * 7, geo_wis1), color = "lightblue", size = 1.5) + - scale_y_log10() diff --git a/forecast/code/deprecated/hrr_state_aggr.R b/forecast/code/deprecated/hrr_state_aggr.R new file mode 100644 index 0000000..4ab073e --- /dev/null +++ b/forecast/code/deprecated/hrr_state_aggr.R @@ -0,0 +1,42 @@ +library(tidyverse) +fips_hrr <- read_csv( + "https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/main/_delphi_utils_python/delphi_utils/data/fips_hrr_table.csv", + col_types = "cid") + +hf <- pivot_wider(fips_hrr, names_from = fips, values_from = weight) +hf <- hf %>% arrange(hrr) +hf_mat <- hf %>% select(-hrr) %>% as.matrix() +hf_mat[is.na(hf_mat)] <- 0 +fh_mat <- MASS::ginv(hf_mat) +hrrs <- hf %>% pull(hrr) +colnames(fh_mat) <- hrrs +fh <- as_tibble(fh_mat) +fh$geo_value <- colnames(hf_mat) %>% + substr(1, 2) %>% + covidcast::fips_to_abbr() %>% + tolower() +sh <- fh %>% + group_by(geo_value) %>% + summarise(across(everything(), sum)) +sh_mat <- sh %>% select(-geo_value) %>% as.matrix() +sh_long <- sh %>% + pivot_longer(-geo_value, names_to = "hrr", values_to = "multiplier") %>% + mutate(hrr = as.character(hrr)) %>% + arrange(hrr) + + +# Process preds ----------------------------------------------------------- + +preds <- preds %>% + rename(hrr = geo_value) %>% + group_by(forecaster, forecast_date, ahead, quantile) %>% + arrange(hrr) #%>% + group_modify( ~ { + left_join(sh_long, .x, by = "hrr") %>% + mutate(value = multiplier * value) %>% + group_by(geo_value) %>% + summarise(value = sum(value)) + }) + +## Need to re-sort and threshold to 0 afterward + diff --git a/forecast/code/deprecated/supplement.Rmd b/forecast/code/deprecated/supplement.Rmd new file mode 100644 index 0000000..ce8bf90 --- /dev/null +++ b/forecast/code/deprecated/supplement.Rmd @@ -0,0 +1,268 @@ + +\clearpage + + + +```{r grab-submitted-case-fcasts} +library(aws.s3) +Sys.setenv("AWS_DEFAULT_REGION" = "us-east-2") +s3bucket <- get_bucket("forecast-eval") +# ensure that forecasters have submitted a reasonable amount +n_keeper_weeks <- 6L +n_keeper_locs <- 50L +case_scores <- s3readRDS("score_cards_state_cases.rds", s3bucket) %>% + mutate(forecast_date = target_end_date - ahead * 7) %>% + # fix weirdnesses about submission dates, + # this is generous to the forecaster + filter(forecast_date < "2021-01-01", ahead < 4) %>% + select(ahead, geo_value, forecaster, target_end_date, wis, forecast_date) + +strawman <- case_scores %>% filter(forecaster == "COVIDhub-baseline") +case_scores <- left_join( + case_scores, + strawman %>% + select(forecast_date, target_end_date, 
geo_value, wis) %>% + rename(strawman_wis = wis) +) %>% filter(forecaster != "COVIDhub-baseline") + +# Subset to those forecasters that submitted enough 14-day ahead forecasts +n_submitted <- case_scores %>% + filter(ahead == 2) %>% + group_by(forecaster) %>% + summarise(nfcasts = n()) +keepers <- n_submitted %>% + filter(nfcasts / n_keeper_locs > n_keeper_weeks - .0001, + forecaster != "COVIDhub-4_week_ensemble") %>% # same as CH-Ensemble + pull(forecaster) + +case_scores <- case_scores %>% + filter(forecaster %in% keepers) %>% + group_by(forecaster, ahead) %>% + summarise(rel_wis = Mean(wis) / Mean(strawman_wis), + geo_wis1 = GeoMean((wis + 1) / (strawman_wis + 1)), + geo_wis = GeoMean(wis / strawman_wis)) +``` + +```{r compare-to-hub-mean, fig.height = 8.5, fig.cap="This figure reproduces Figure 3 in the main paper but overlays scores for the forecasts submitted to the COVID-19 Forecast Hub. Grey lines correspond to the various teams that submitted during period our evaluation period. We have highlighted the COVIDhub-ensemble, which is the official forecast of the CDC.", include=FALSE} +# Same as Fig 3 in the paper +df <- summarizer(fcasts_honest %>% + group_by(ahead, forecaster), "wis", NULL, + "strawman_wis", Mean, c("aggr","scale")) +ggplot(df) + + geom_line(aes(ahead, wis, color = forecaster)) + + geom_point(aes(ahead, wis, color = forecaster)) + + theme_bw() + + geom_hline(yintercept = 1, size = 1.5) + + xlab("Days ahead") + + ylab("Mean WIS (relative to baseline)") + + scale_color_manual(values = c(fcast_colors, "CH-ensemble" = "lightblue"), + guide = guide_legend(nrow = 2)) + + theme(legend.position = "bottom", legend.title = element_blank()) + + geom_line(data = case_scores, + aes(ahead * 7, rel_wis, group = forecaster), + color = "grey70") + + geom_line(data = case_scores %>% filter(forecaster == "COVIDhub-ensemble"), + aes(ahead * 7, rel_wis), color = "lightblue", size = 1.5) + + scale_y_log10() +``` + +\clearpage + +```{r compare-to-hub-geomean, fig.height = 8.5, fig.cap="This figure is similar to Figure \\ref{fig:fcast-adjusted}. In this case, we add 1 to both the forecaster WIS and the baseline WIS before scaling (to allow forecasters that achieve 0 error to appear), and we overlay scores for the forecasts submitted to the COVID-19 Forecast Hub. Grey lines correspond to the various teams that submitted during period our evaluation period. We have highlighted the COVIDhub-ensemble, which is the official forecast of the CDC.", include=FALSE} +# Like supplement GeoMean figure, but adding 1 +df2 <- fcasts_honest %>% + group_by(ahead, forecaster) %>% + summarise(wis = GeoMean((wis + 1) / (strawman_wis + 1))) + +ggplot(df2) + + geom_line(aes(ahead, wis, color = forecaster)) + + geom_point(aes(ahead, wis, color = forecaster)) + + theme_bw() + + geom_hline(yintercept = 1, size = 1.5) + + xlab("Days ahead") + + ylab("Geometric mean WIS (relative to baseline)") + + scale_color_manual(values = c(fcast_colors, "CH-ensemble" = "lightblue"), + guide = guide_legend(nrow = 2)) + + theme(legend.position = "bottom", legend.title = element_blank()) + + geom_line(data = case_scores, + aes(ahead * 7, geo_wis1, group = forecaster), + color = "grey70") + + geom_line(data = case_scores %>% filter(forecaster == "COVIDhub-ensemble"), + aes(ahead * 7, geo_wis1), color = "lightblue", size = 1.5) + + scale_y_log10() +``` + +```{r hotspots-upswing-downswing-remake, fig.cap="REMAKE: Classification and loglikelihood separated into periods of upswing, downswing, and flat cases. 
The indicator assisted models always have lower classification error relative to the null classifier, while the AR actually does worse in down and flat periods. In terms of loglikelihood, all forecasters have lower likelihood than the null classifier during up periods. In down and flat periods, the indicators generally improve over the AR, while the are worse during up periods.", eval=FALSE} +# mycummean <- function(x) { +# isgood <- ! is.na(x) +# denom <- cumsum(isgood) +# x[!isgood] <- 0 +# num <- cumsum(x) +# y <- num / denom +# y[is.na(y)] <- .5 # deal with empties at the beginning +# y +# } +# +# null_classifier <- readRDS(here("data", +# "all_signals_wide_as_of_2020-05-18.RDS")) %>% +# rename(jhu = `value+0:jhu-csse_confirmed_7dav_incidence_prop`) %>% +# select(geo_value, time_value, jhu) %>% +# group_by(geo_value) %>% +# arrange(time_value) %>% +# mutate(lag_value = lag(jhu, 7), +# apc = (jhu - lag_value) / lag_value, +# hot = case_when( +# apc > .25 & lag_value > 30 ~ 1, +# apc <= .25 & lag_value > 30 ~ 0), +# null_class = mycummean(hot)) + +hot_udf <- inner_join( + hotspots_honest, + up_down %>% select(geo_value, target_end_date, udf)) + +cutoff <- mean(hot_udf %>% filter(forecaster == "AR") %>% pull(actual)) + +con_tab <- hot_udf %>% + filter(!is.na(actual)) %>% + mutate(pred = value > .5) %>% + group_by(forecaster, udf) %>% + summarise(m = (mean(pred != actual) - cutoff) / cutoff, + err = mean(pred != actual)) %>% + ungroup() + +# con_tab <- left_join( +# con_tab %>% filter(forecaster != "AR"), +# con_tab %>% filter(forecaster == "AR") %>% +# select(-forecaster) %>% +# rename(mar = m)) + +llike_tab <- hot_udf %>% + filter(!is.na(actual)) %>% + # left_join(null_classifier %>% select(geo_value, time_value, null_class) %>% + # rename(target_end_date = time_value)) %>% + mutate( # kill prob 0-1 predictions + value = case_when( + value < 1e-8 ~ 1e-8, + value > 1-1e-8 ~ 1-1e-8, + TRUE ~ value), + # null_class = case_when( + # null_class < 1e-8 ~ 1e-8, + # null_class > 1-1e-8 ~ 1-1e-8, + # TRUE ~ null_class), + llike = (actual == 1) * log(value) + (actual == 0) * log(1 - value), + nulldev = (actual == 1) * log(cutoff) + + (actual == 0) * log(1 - cutoff)) %>% + group_by(forecaster, udf) %>% + summarise(m = (mean(llike) - mean(nulldev)) / abs(mean(nulldev)), + mm = mean(llike), + mmm = mean(nulldev)) %>% + ungroup() + +# llike_tab <- left_join( +# llike_tab %>% filter(forecaster != "AR"), +# llike_tab %>% filter(forecaster == "AR") %>% +# select(-forecaster) %>% +# rename(mar = m)) + +bind_rows(con_tab %>% mutate(err = "Classification error"), + llike_tab %>% mutate(err = "Log likelihood")) %>% + ggplot(aes(udf, m , fill = forecaster)) + + geom_bar(width = 0.6, position = position_dodge(width=0.6), + stat = "identity") + + scale_fill_manual(values = fcast_colors, guide = guide_legend(nrow = 1)) + + #scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + + theme_bw() + + geom_hline(yintercept = 0) + + scale_y_continuous(labels = scales::percent_format()) + + facet_wrap(~err, scales = "free_y") + + ylab("Improvement relative to the null classifier") + + xlab("") + + theme(legend.position = "bottom", legend.title = element_blank()) +``` + +```{r cor-wis-ratio-m1, eval = FALSE, fig.cap="This is Alden's second set of histograms. 
Here we have the correlation of the absolute value of WIS ratio - 1 with the percent change in 7dav cases relative to 7 days earlier"} +ggplot(pct_chng_df %>% + group_by(forecaster) %>% + mutate(median = median(cor_abs_wisratio_minus_1_pctchange)), + aes(x = cor_abs_wisratio_minus_1_pctchange)) + + geom_histogram(aes(y = ..count.. / sum(..count..), fill = forecaster), bins = 50) + + scale_fill_manual(values = fcast_colors) + + theme_bw() + + theme(legend.position = "none") + + geom_vline(aes(xintercept = median), linetype = "dotted") + + geom_vline(xintercept = 0) + + facet_wrap(~forecaster) + + xlab("Spearman correlation") + ylab("Relative frequency") + + scale_y_continuous(labels = scales::label_percent(accuracy = 1)) +``` + +```{r upswing-histogram-logged, eval = FALSE, fig.cap="Not sure if we want this here. Similar to Figure 5 in the manuscript but taking logs. "} +up_down_df %>% + group_by(forecaster, udf) %>% + ggplot(aes((AR_wis + 1) / (wis + 1), fill = forecaster)) + + geom_histogram(bins = 100) + + facet_grid(udf ~ forecaster) + + theme_bw() + + ylab("Count") + + theme(legend.position = "none") + + scale_fill_manual(values = fcast_colors) + + scale_x_log10() + + scale_y_log10(breaks = c(10,1000,100000), + labels = trans_format("log10", math_format(10^.x))) + + xlab("AR WIS - forecaster WIS") + + geom_vline(xintercept = 1) +``` + +```{r leading-and-lagging, eval=FALSE} +# Deprecated +df2 %>% + group_by(forecaster, udf) %>% + summarise( + leadingness = cor(AR_wis - wis, leading, use = "pairwise.complete.obs"), + laggingness = cor(AR_wis - wis, lagging, use = "pairwise.complete.obs")) %>% + pivot_longer(leadingness:laggingness) %>% + ggplot(aes(udf, value, fill = forecaster)) + + geom_bar(width = 0.6, position = position_dodge(width=0.6), + stat = "identity") + + geom_hline(yintercept = 0) + + scale_fill_manual(values = fcast_colors) + + facet_wrap(~name) + + theme_bw() + + ylab("Correlation") + + xlab("") + + theme(legend.position = "bottom", legend.title = element_blank()) +``` + + +```{r fcast-gs-locations, eval=FALSE} +plotter(fcasts_gs %>% filter(period != "jm"), + "wis", Mean, scaler = "strawman_wis", + order_of_operations = c("aggr","scale")) + + ylab("Mean WIS (relative to baseline)") +``` + +```{r fcast-gs-locations-adjusted, eval=FALSE} +plotter(fcasts_gs %>% filter(period != "jm"), + "wis", GeoMean, scaler = "strawman_wis", + order_of_operations = c("scale","aggr")) + + ylab("Geometric mean WIS (relative to baseline)") +``` + +```{r hot-gs-locations, eval=FALSE} +plotter_hotspots(hotspots_gs %>% filter(period != "jm")) + + geom_hline(yintercept = 0.5) +``` + +```{r traj-data, eval = FALSE} +source(here("code", "deprecated", "trajectory_plot_funs.R")) +preds <- readRDS(here("data", "predictions_honest.RDS")) +traj_best <- get_trajectory_plots(fcasts_honest, preds, actuals, hrr_tab, + "only2020", "best") +traj_worst <- get_trajectory_plots(fcasts_honest, preds, actuals, hrr_tab, + "only2020", "worst") +rm(preds) +``` + +```{r trajectory-plots, eval = FALSE} +for (nm in names(traj_best)) print(trajectory_panel(nm, traj_best, traj_worst)) +``` \ No newline at end of file diff --git a/forecast/code/figures/ccf-dv-finalized.R b/forecast/code/figures/ccf-dv-finalized.R index f26faef..5267d6f 100644 --- a/forecast/code/figures/ccf-dv-finalized.R +++ b/forecast/code/figures/ccf-dv-finalized.R @@ -24,7 +24,7 @@ g1 <- std_sigs %>% geom_line() + theme_bw() + ylab("Standardized signal") + - xlab("") + + xlab("Date") + scale_color_viridis_d(begin=.25, end=.75, name = "") + 
theme(legend.position = "bottom") @@ -36,7 +36,7 @@ g2 <- ggplot(tibble(lag = drop(cc$lag), ccf = drop(cc$acf)), aes(lag)) + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) + ylab("Cross correlation") + - xlab("Lag (a)") + + xlab("Shift value (a)") + theme_bw() + theme(legend.position = "none") diff --git a/forecast/code/figures/compare-to-hub-ryan.Rmd b/forecast/code/figures/compare-to-hub-ryan.Rmd new file mode 100644 index 0000000..a5cc8e5 --- /dev/null +++ b/forecast/code/figures/compare-to-hub-ryan.Rmd @@ -0,0 +1,202 @@ + +# Import from the Hub ----------------------------------------------------- +```{r eval = FALSE} +library(tidyverse) +library(aws.s3) +Sys.setenv("AWS_DEFAULT_REGION" = "us-east-2") +s3bucket <- get_bucket("forecast-eval") +n_keeper_weeks <- 6L +n_keeper_locs <- 50L +# case_scores <- s3readRDS("score_cards_state_cases.rds", s3bucket) +case_preds <- s3readRDS("predictions_cards.rds", s3bucket) %>% + filter(signal == "confirmed_incidence_num", + forecast_date < "2021-01-01", + !(geo_value %in% c("as", "gu", "pr", "vi", "mp", "us"))) %>% + select(-data_source, -signal, -incidence_period) +actuals <- covidcast::covidcast_signal("jhu-csse", "confirmed_7dav_incidence_num", + geo_type = "state", as_of = "2021-05-18", + end_day = "2021-03-01") %>% + select(geo_value, time_value, value) %>% + mutate(value = value * 7) %>% + rename(target_end_date = time_value, actual = value) +case_scores <- evalcast::evaluate_predictions( + case_preds, actuals, + err_measures = list(wis = evalcast::weighted_interval_score), + grp_vars = c("ahead", "forecaster", "forecast_date", "geo_value") +) +rm(case_preds) +case_scores <- case_scores %>% + mutate( + ahead = ahead * 7 - 2, # forecast on a Tuesday + forecast_date = target_end_date - ahead) %>% + # fix weirdnesses about submission dates + filter(forecast_date < "2021-01-01") %>% + select(ahead, geo_value, forecaster, target_end_date, wis, forecast_date) + +strawman <- case_scores %>% filter(forecaster == "COVIDhub-baseline") +case_scores <- left_join( + case_scores, + strawman %>% + select(forecast_date, target_end_date, geo_value, wis) %>% + rename(strawman_wis = wis) + ) +case_scores <- case_scores %>% + filter(forecaster != "COVIDhub-baseline") +aheads <- case_scores %>% distinct(ahead) %>% pull() +n_submitted <- case_scores %>% + filter(ahead == aheads[2]) %>% + group_by(forecaster) %>% + summarise(nfcasts = n()) +keepers <- n_submitted %>% + filter(nfcasts / n_keeper_locs > n_keeper_weeks - .0001) %>% + # submitted at least z weeks for x locations + pull(forecaster) + +case_scores <- case_scores %>% filter(forecaster %in% keepers) +``` + +# Load our models --------------------------------------------------------- +```{r} +library(tidyverse) +ours <- readRDS(here::here("data", "results_honest_states.RDS")) %>% + filter(forecaster != "AR3GSSAA3_Subset") +pop <- covidcast::state_census %>% select(ABBR, POPESTIMATE2019) %>% + mutate(geo_value = tolower(ABBR)) %>% select(-ABBR) +# scale from prop to num +ours2 <- left_join(ours, pop, by = "geo_value") %>% + mutate(wis = wis, #* POPESTIMATE2019, #/ 1e5 * 7, + forecaster = recode(forecaster, + AR3 = "AR", + AR3CHCLI3 = "CHNG-CLI", + AR3CHCOV3 = "CHNG-COVID", + AR3DVCLI3 = "DV-CLI", + AR3FBCLI3 = "CTIS-CLIIC", + AR3GSSAA3_Zero = "Google-AA")) %>% + select(-ae, -POPESTIMATE2019) + +# Join sanity check +nrow(ours) == nrow(ours2) +ours = ours2 +``` + +```{r} +our_models <- ours %>% filter(forecaster != "Baseline") +baseline <- ours %>% filter(forecaster == "Baseline") %>% + 
select(-forecaster) %>% rename(strawman_wis = wis) +our_models <- left_join(our_models, baseline) + +# Sanity check: left join +nrow(our_models) == nrow(ours) - sum(ours$forecaster == "Baseline") + +# Sanity check: missingness +our_models %>% + group_by(forecaster) %>% + summarize(sum(is.na(wis))) + +our_models %>% summarize(sum(is.na(strawman_wis))) + +Mean <- function(x) mean(x, na.rm = TRUE) +GeoMean <- function(x) exp(mean(log(x), na.rm = TRUE)) + +all_models = our_models +all_time_performance <- all_models %>% + #filter(forecast_date %in% common_fd) %>% + group_by(forecaster, ahead) %>% #, source) %>% + summarise(rel_wis = Mean(wis) / Mean(strawman_wis), + geo_wis = GeoMean((wis) / (strawman_wis))) %>% + pivot_longer(contains("wis")) + +facet_labs <- c(geo_wis = "Geometric mean of WIS", rel_wis = "Mean of WIS") +fcast_colors <- c(RColorBrewer::brewer.pal(5, "Set1"), "#000000") +names(fcast_colors) <- c("CHNG-CLI", "CHNG-COVID", "CTIS-CLIIC", "DV-CLI", + "Google-AA", "AR") + + +ggplot(all_time_performance %>% filter(ahead <= 21), + aes(ahead, value, color = forecaster)) + + geom_line(aes(ahead, value, color = forecaster)) + + geom_point(aes(ahead, value, color = forecaster)) + + scale_color_manual(values = fcast_colors, guide = guide_legend(nrow = 1)) + + theme_bw() + + ylab("Score relative to baseline") + + geom_hline(yintercept = 1, size = 1.5) + + xlab("Days ahead") + + facet_wrap(~ name, labeller = labeller(name = facet_labs), scales = "free_y") + + theme(legend.position = "bottom", legend.title = element_blank()) +``` + +```{r} +p1 <- ours %>% + group_by(forecaster, ahead) %>% + summarize(mean_wis = Mean(wis)) %>% + ggplot(aes(x = ahead, y = mean_wis, color = forecaster)) + + geom_line() + geom_point() + + xlab("Days ahead") + ylab("Mean WIS") + + scale_color_manual(values = fcast_colors, guide = guide_legend(nrow = 1)) + + theme_bw() + + theme(legend.position = "bottom", legend.title = element_blank()) + +p2 <- ours %>% + group_by(forecaster, ahead) %>% + summarize(geomean_wis = GeoMean(wis)) %>% + ggplot(aes(x = ahead, y = geomean_wis, color = forecaster)) + + geom_line() + geom_point() + + xlab("Days ahead") + ylab("Geo mean WIS") + + scale_color_manual(values = fcast_colors, guide = guide_legend(nrow = 1)) + + theme_bw() + + theme(legend.position = "bottom", legend.title = element_blank()) + +gridExtra::grid.arrange(p2, p1, nrow = 1) +``` + +# Try with other code +```{r} +library(tidyverse) +library(lubridate) +source(here::here("code", "figures", "results_processing_and_plotting.R")) +source(here::here("code", "figures", "eval_funs.R")) +ours3 <- readRDS(here::here("data", "results_honest_states.RDS")) %>% + process_res_cases(actuals) %>% + filter(forecaster != "gs_subset") %>% + filter(forecaster != "gs_inherit") %>% + mutate(forecaster = recode(forecaster, gs = "Google-AA")) + +unique(ours3$forecaster)[!(unique(ours3$forecaster) %in% names(fcast_colors))] + +p1 <- plotter(ours3 %>% filter(ahead <= 21), + "wis", Mean, scaler = "strawman_wis", + order_of_operations = c("aggr","scale")) + + ylab("Mean WIS (relative to baseline)") + +p2 <- plotter(ours3 %>% filter(ahead <= 21), + "wis", GeoMean, scaler = "strawman_wis", + order_of_operations = c("aggr","scale")) + + ylab("Geometric mean WIS (relative to baseline)") + +gridExtra::grid.arrange(p2, p1, nrow = 1) +``` + +# Load our HRR results --------------------------------------------------------- +```{r eval = FALSE} +source(here::here("code", "figures", "results_processing_and_plotting.R")) +source(here::here("code", 
"figures", "eval_funs.R")) + +actuals <- readRDS(here::here("data", "confirmed_7dav_incidence_prop.RDS")) +fcasts_honest <- readRDS(here::here("data", "results_honest.RDS")) + process_res_cases(actuals) %>% + filter(forecaster != "gs_subset") %>% + filter(forecaster != "gs_inherit") %>% + mutate(forecaster = recode(forecaster, gs = "Google-AA")) + +p1 <- plotter(ours3 %>% filter(ahead <= 21), + "wis", Mean, scaler = "strawman_wis", + order_of_operations = c("aggr","scale")) + + ylab("Mean WIS (relative to baseline)") + +p2 <- plotter(ours3 %>% filter(ahead <= 21), + "wis", GeoMean, scaler = "strawman_wis", + order_of_operations = c("aggr","scale")) + + ylab("Geometric mean WIS (relative to baseline)") + +gridExtra::grid.arrange(p2, p1, nrow = 1) +``` diff --git a/forecast/code/figures/compare-to-hub.R b/forecast/code/figures/compare-to-hub.R new file mode 100644 index 0000000..7103cb0 --- /dev/null +++ b/forecast/code/figures/compare-to-hub.R @@ -0,0 +1,134 @@ + +# Import from the Hub ----------------------------------------------------- + +library(tidyverse) +library(aws.s3) +Sys.setenv("AWS_DEFAULT_REGION" = "us-east-2") +s3bucket <- get_bucket("forecast-eval") +n_keeper_weeks <- 6L +n_keeper_locs <- 50L + +case_preds <- s3readRDS("predictions_cards.rds", s3bucket) %>% + filter(signal == "confirmed_incidence_num", + forecast_date < "2021-01-01", + !(geo_value %in% c("as", "gu", "pr", "vi", "mp", "us"))) +actuals <- covidcast::covidcast_signal("jhu-csse", "confirmed_7dav_incidence_num", + geo_type = "state", as_of = "2021-05-18", + end_day = "2021-03-01") %>% + mutate(value = value * 7) %>% + rename(target_end_date = time_value, actual = value) %>% + select(geo_value, target_end_date, actual) +case_scores <- evalcast::evaluate_predictions( + case_preds, actuals, + err_measures = list(wis = evalcast::weighted_interval_score), + grp_vars = c("ahead", "forecaster", "forecast_date", "geo_value") +) +rm(case_preds) + + +case_scores <- case_scores %>% + mutate( + ahead = ahead * 7 - 2, # forecast on a Tuesday + forecast_date = target_end_date - ahead) %>% + # fix weirdnesses about submission dates + filter(forecast_date < "2021-01-01") %>% + select(ahead, geo_value, forecaster, target_end_date, wis, forecast_date) + +strawman <- case_scores %>% filter(forecaster == "COVIDhub-baseline") +case_scores <- left_join( + case_scores, + strawman %>% + select(forecast_date, target_end_date, geo_value, wis) %>% + rename(strawman_wis = wis) + ) +case_scores <- case_scores %>% + filter(forecaster != "COVIDhub-baseline") +aheads <- case_scores %>% distinct(ahead) %>% pull() +n_submitted <- case_scores %>% + filter(ahead == aheads[2]) %>% + group_by(forecaster) %>% + summarise(nfcasts = n()) +keepers <- n_submitted %>% + filter(nfcasts / n_keeper_locs > n_keeper_weeks - .0001) %>% + # submitted at least z weeks for x locations + pull(forecaster) + +case_scores <- case_scores %>% filter(forecaster %in% keepers) +case_scores <- case_scores %>% mutate( + forecaster = recode(forecaster, `COVIDhub-ensemble` = "Ensemble")) + +# Load our models --------------------------------------------------------- + +ours <- readRDS(here::here("data", "results_honest_states.RDS")) +pop <- covidcast::state_census %>% select(ABBR, POPESTIMATE2019) %>% + mutate(geo_value = tolower(ABBR)) %>% select(-ABBR) +# scale from prop to num +ours <- left_join(ours, pop) %>% + mutate(wis = wis * POPESTIMATE2019 / 1e5 * 7, + forecaster = recode(forecaster, + AR3 = "AR", + AR3CHCLI3 = "CHNG-CLI", + AR3CHCOV3 = "CHNG-COVID", + AR3DVCLI3 = 
"DV-CLI", + AR3FBCLI3 = "CTIS-CLIIC", + AR3GSSAA3_Subset = "drop", + AR3GSSAA3_Zero = "Google-AA")) %>% + filter(forecaster != "drop") %>% + select(-ae, -POPESTIMATE2019) + +common_fd <- as.Date(intersect( + case_scores %>% select(forecast_date) %>% distinct() %>% pull(), + ours %>% select(forecast_date) %>% distinct() %>% pull()), + "1970-01-01") + +our_models <- ours %>% filter(forecaster != "Baseline") +baseline <- ours %>% filter(forecaster == "Baseline") %>% + select(-forecaster) %>% rename(strawman_wis = wis) +our_models <- left_join(our_models, baseline) +all_models <- bind_rows(hub = case_scores, ours = our_models, .id = "source") + +Mean <- function(x) mean(x, na.rm = TRUE) +GeoMean <- function(x) exp(mean(log(x), na.rm = TRUE)) + +all_time_performance <- all_models %>% + filter(forecast_date %in% common_fd) %>% + group_by(forecaster, ahead, source) %>% + summarise(rel_wis = Mean(wis) / Mean(strawman_wis), + geo_wis = GeoMean(wis) / GeoMean(strawman_wis)) %>% + pivot_longer(contains("wis")) + +our_performance <- our_models %>% + group_by(forecaster, ahead) %>% + summarise(rel_wis = Mean(wis) / Mean(strawman_wis), + geo_wis = GeoMean(wis) / GeoMean(strawman_wis)) %>% + pivot_longer(contains("wis")) + +facet_labs <- c(geo_wis = "Geometric mean", rel_wis = "Mean") +fcast_colors <- c("#000000", RColorBrewer::brewer.pal(5, "Set1")) +names(fcast_colors) <- c("AR", "CHNG-CLI", "CHNG-COVID", "CTIS-CLIIC", "DV-CLI", + "Google-AA") + +ggplot(all_time_performance %>% filter(source == "ours"), + aes(ahead, value, color = forecaster)) + + geom_line(data = all_time_performance %>% + filter(source == "hub", !(forecaster %in% c("Ensemble", + "OliverWyman-Navigator"))), + # Data bug? OliverWyman has GeoMean(RelWis) = 0 at ahead = 5 + aes(group = forecaster), + color = "grey80") + + geom_line(data = all_time_performance %>% filter(forecaster == "Ensemble"), + color = "lightblue", size = 1.5) + + geom_point(data = all_time_performance %>% filter(forecaster == "Ensemble"), + color = "lightblue", size = 1.5) + + geom_line(aes(ahead, value, color = forecaster)) + + geom_point(aes(ahead, value, color = forecaster)) + + scale_color_manual(values = c(fcast_colors, "Ensemble" = "lightblue"), + guide = guide_legend(nrow = 2)) + + ylab("WIS (relative to baseline)") + + geom_hline(yintercept = 1, linetype = "dashed") + + xlab("Days ahead") + + facet_wrap(~ name, labeller = labeller(name = facet_labs)) + + theme_bw() + theme(legend.pos = "bottom", legend.title = element_blank()) + +ggsave(here::here("paper", "fig", "compare-states-to-hub.pdf"), + width = 6.5, height = 4.5) diff --git a/forecast/code/figures/eval_funs.R b/forecast/code/figures/eval_funs.R index 25d8efb..8c17228 100644 --- a/forecast/code/figures/eval_funs.R +++ b/forecast/code/figures/eval_funs.R @@ -1,7 +1,6 @@ -fcast_colors <- c(RColorBrewer::brewer.pal(5, "Set1"), "#000000") -names(fcast_colors) <- c("CHNG-CLI", "CHNG-COVID", "CTIS-CLIIC", "DV-CLI", - "Google-AA", "AR") - +fcast_colors <- c("#000000", RColorBrewer::brewer.pal(5, "Set1")) +names(fcast_colors) <- c("AR", "CHNG-CLI", "CHNG-COVID", "CTIS-CLIIC", "DV-CLI", + "Google-AA") Mean <- function(x) mean(x, na.rm = TRUE) Median <- function(x) median(x, na.rm = TRUE) @@ -11,7 +10,6 @@ TrimMean05 <- function(x) mean(x, trim = .05, na.rm = TRUE) TrimMean1 <- function(x) mean(x, trim = .1, na.rm = TRUE) GeoMean <- function(x) exp(mean(log(x), na.rm = TRUE)) - pct_change <- function(x, n = 14, col_name = "pct_change"){ # courtesy of Alden dt <- c(0,-n) diff --git 
a/forecast/paper/fig/ccf-dv-finalized-1.pdf b/forecast/paper/fig/ccf-dv-finalized-1.pdf index 45cab91..3a15b7f 100644 Binary files a/forecast/paper/fig/ccf-dv-finalized-1.pdf and b/forecast/paper/fig/ccf-dv-finalized-1.pdf differ diff --git a/forecast/paper/fig/compare-states-to-hub.pdf b/forecast/paper/fig/compare-states-to-hub.pdf new file mode 100644 index 0000000..6d97acf Binary files /dev/null and b/forecast/paper/fig/compare-states-to-hub.pdf differ diff --git a/forecast/paper/fig/cor-wis-ratio-1.pdf b/forecast/paper/fig/cor-wis-ratio-1.pdf index 5037fcf..1705586 100644 Binary files a/forecast/paper/fig/cor-wis-ratio-1.pdf and b/forecast/paper/fig/cor-wis-ratio-1.pdf differ diff --git a/forecast/paper/fig/cumulative-mean-1.pdf b/forecast/paper/fig/cumulative-mean-1.pdf index 1c0cf16..167a193 100644 Binary files a/forecast/paper/fig/cumulative-mean-1.pdf and b/forecast/paper/fig/cumulative-mean-1.pdf differ diff --git a/forecast/paper/fig/diff-in-lead-lag-1.pdf b/forecast/paper/fig/diff-in-lead-lag-1.pdf index f0b4709..730b14d 100644 Binary files a/forecast/paper/fig/diff-in-lead-lag-1.pdf and b/forecast/paper/fig/diff-in-lead-lag-1.pdf differ diff --git a/forecast/paper/fig/errs-in-space-1.pdf b/forecast/paper/fig/errs-in-space-1.pdf index c77863a..9dfe6e1 100644 Binary files a/forecast/paper/fig/errs-in-space-1.pdf and b/forecast/paper/fig/errs-in-space-1.pdf differ diff --git a/forecast/paper/fig/fcast-1.pdf b/forecast/paper/fig/fcast-1.pdf index c272176..aa16bbf 100644 Binary files a/forecast/paper/fig/fcast-1.pdf and b/forecast/paper/fig/fcast-1.pdf differ diff --git a/forecast/paper/fig/fcast-adjusted-1.pdf b/forecast/paper/fig/fcast-adjusted-1.pdf index 254f3d1..e95be47 100644 Binary files a/forecast/paper/fig/fcast-adjusted-1.pdf and b/forecast/paper/fig/fcast-adjusted-1.pdf differ diff --git a/forecast/paper/fig/fcast-booted-1.pdf b/forecast/paper/fig/fcast-booted-1.pdf index b513a62..d738d92 100644 Binary files a/forecast/paper/fig/fcast-booted-1.pdf and b/forecast/paper/fig/fcast-booted-1.pdf differ diff --git a/forecast/paper/fig/fcast-booted-adjusted-1.pdf b/forecast/paper/fig/fcast-booted-adjusted-1.pdf index 44c642b..8add33f 100644 Binary files a/forecast/paper/fig/fcast-booted-adjusted-1.pdf and b/forecast/paper/fig/fcast-booted-adjusted-1.pdf differ diff --git a/forecast/paper/fig/fcast-finalized-1.pdf b/forecast/paper/fig/fcast-finalized-1.pdf index 780c92d..7732ae4 100644 Binary files a/forecast/paper/fig/fcast-finalized-1.pdf and b/forecast/paper/fig/fcast-finalized-1.pdf differ diff --git a/forecast/paper/fig/fcast-honest-v-finalized-1.pdf b/forecast/paper/fig/fcast-honest-v-finalized-1.pdf index 5d86bff..ad764c0 100644 Binary files a/forecast/paper/fig/fcast-honest-v-finalized-1.pdf and b/forecast/paper/fig/fcast-honest-v-finalized-1.pdf differ diff --git a/forecast/paper/fig/fcast-hot-combo-1.pdf b/forecast/paper/fig/fcast-hot-combo-1.pdf index e6d8bb8..774d8b1 100644 Binary files a/forecast/paper/fig/fcast-hot-combo-1.pdf and b/forecast/paper/fig/fcast-hot-combo-1.pdf differ diff --git a/forecast/paper/fig/hot-1.pdf b/forecast/paper/fig/hot-1.pdf index a299e9a..39460f9 100644 Binary files a/forecast/paper/fig/hot-1.pdf and b/forecast/paper/fig/hot-1.pdf differ diff --git a/forecast/paper/fig/hot-booted-1.pdf b/forecast/paper/fig/hot-booted-1.pdf index dd2d80b..7ec60be 100644 Binary files a/forecast/paper/fig/hot-booted-1.pdf and b/forecast/paper/fig/hot-booted-1.pdf differ diff --git a/forecast/paper/fig/hot-finalized-1.pdf 
b/forecast/paper/fig/hot-finalized-1.pdf index 0a4e3a8..b9efc3f 100644 Binary files a/forecast/paper/fig/hot-finalized-1.pdf and b/forecast/paper/fig/hot-finalized-1.pdf differ diff --git a/forecast/paper/fig/hot-honest-v-finalized-1.pdf b/forecast/paper/fig/hot-honest-v-finalized-1.pdf index 6e55878..e816480 100644 Binary files a/forecast/paper/fig/hot-honest-v-finalized-1.pdf and b/forecast/paper/fig/hot-honest-v-finalized-1.pdf differ diff --git a/forecast/paper/fig/hotspots-upswing-downswing-1.pdf b/forecast/paper/fig/hotspots-upswing-downswing-1.pdf index e61beb1..87c9a5e 100644 Binary files a/forecast/paper/fig/hotspots-upswing-downswing-1.pdf and b/forecast/paper/fig/hotspots-upswing-downswing-1.pdf differ diff --git a/forecast/paper/fig/lagging-only-1.pdf b/forecast/paper/fig/lagging-only-1.pdf index f633c73..7f8b938 100644 Binary files a/forecast/paper/fig/lagging-only-1.pdf and b/forecast/paper/fig/lagging-only-1.pdf differ diff --git a/forecast/paper/fig/leading-only-1.pdf b/forecast/paper/fig/leading-only-1.pdf index a6ee848..8a1a680 100644 Binary files a/forecast/paper/fig/leading-only-1.pdf and b/forecast/paper/fig/leading-only-1.pdf differ diff --git a/forecast/paper/fig/sign-test-1.pdf b/forecast/paper/fig/sign-test-1.pdf index 5eb0a06..56c8e37 100644 Binary files a/forecast/paper/fig/sign-test-1.pdf and b/forecast/paper/fig/sign-test-1.pdf differ diff --git a/forecast/paper/fig/state-level-performance.pdf b/forecast/paper/fig/state-level-performance.pdf new file mode 100644 index 0000000..6544894 Binary files /dev/null and b/forecast/paper/fig/state-level-performance.pdf differ diff --git a/forecast/paper/fig/upswing-corr-table-1.pdf b/forecast/paper/fig/upswing-corr-table-1.pdf index 0264408..e158944 100644 Binary files a/forecast/paper/fig/upswing-corr-table-1.pdf and b/forecast/paper/fig/upswing-corr-table-1.pdf differ diff --git a/forecast/paper/fig/upswing-histogram-1.pdf b/forecast/paper/fig/upswing-histogram-1.pdf index 233b42b..facda1c 100644 Binary files a/forecast/paper/fig/upswing-histogram-1.pdf and b/forecast/paper/fig/upswing-histogram-1.pdf differ diff --git a/forecast/paper/fig/upswing-summary-1.pdf b/forecast/paper/fig/upswing-summary-1.pdf index d78d46a..96ba57e 100644 Binary files a/forecast/paper/fig/upswing-summary-1.pdf and b/forecast/paper/fig/upswing-summary-1.pdf differ diff --git a/forecast/paper/fig/upswing-summary-remake-1.pdf b/forecast/paper/fig/upswing-summary-remake-1.pdf index 5a3ea76..07f0fe0 100644 Binary files a/forecast/paper/fig/upswing-summary-remake-1.pdf and b/forecast/paper/fig/upswing-summary-remake-1.pdf differ diff --git a/forecast/paper/fig/wis-densities-1.pdf b/forecast/paper/fig/wis-densities-1.pdf index 8f5a8b2..0ed2b51 100644 Binary files a/forecast/paper/fig/wis-densities-1.pdf and b/forecast/paper/fig/wis-densities-1.pdf differ diff --git a/forecast/paper/paper.pdf b/forecast/paper/paper.pdf new file mode 100644 index 0000000..902ee6e Binary files /dev/null and b/forecast/paper/paper.pdf differ diff --git a/forecast/paper/paper.tex b/forecast/paper/paper.tex index 236388e..1da7f33 100644 --- a/forecast/paper/paper.tex +++ b/forecast/paper/paper.tex @@ -47,18 +47,18 @@ \leadauthor{McDonald} % Please add a significance statement to explain the relevance of your work -\significancestatement{Forecasting is a critical element in the public health - response to any fast-moving epidemic or pandemic. 
Predicting the future - spread of some disease process, based on disease activity in the recent - past---say, using autoregressive modeling from the time series literature---is - a basic and widely-used method. In this paper we study whether including - auxiliary indicators as additional features in such an autoregressive model - can be useful in COVID-19 case forecasting and hotspot prediction. We find - that indicators based on de-identified medical insurance claims, self-reported - symptoms through online surveys, and COVID-related internet search activity - all provide a nontrivial improvement in forecast accuracy, though their - utility depends on the pandemic's local dynamics (upswing, downswing, or flat - period) at prediction time. +\significancestatement{Validated forecasting methodology should be a vital + element in the public health response to any fast-moving epidemic or + pandemic. A widely-used model for predicting the future spread of a temporal + process, based on past data, is an autoregressive (AR) model. While basic, + such an AR model (properly trained) is already competitive with the top models + in operational use for COVID-19 forecasting. In this paper, we exhibit five + auxiliary indicators---based on de-identified medical insurance claims, + self-reported symptoms via online surveys, and COVID-related Google + searches---that further improve the predictive accuracy of an AR model in + COVID-19 forecasting. The most substantial gains appear to be in quiescent + times; but the Google search indicator appears to also offer improvements + during upswings in pandemic activity. } % Please include corresponding author, author contribution and author @@ -80,27 +80,24 @@ digital surveillance} \begin{abstract} - Reliable, short-term forecasts of traditional public health reporting streams - (such as cases, hospitalizations, and deaths) are a key ingredient in - effective public health decision-making during a pandemic. Since April 2020, - our research group has worked with data partners to collect, curate, and make - publicly available numerous real-time COVID-19 indicators, providing multiple - views of pandemic activity. This paper studies the utility of these - indicators from a forecasting perspective. We focus on five indicators, - derived from medical insurance claims data, web search queries, and online - survey responses. For each indicator, we ask whether its inclusion in a simple - model leads to improved predictive accuracy relative to a similar model - excluding it. We consider both probabilistic forecasting of confirmed COVID-19 - case rates and binary prediction of case ``hotspots''. Since the values of - indicators (and case rates) are commonly revised over time, we take special - care to ensure that the data provided to a forecaster is the version that - would have been available at the time the forecast was made. - % We demonstrate how less careful backtesting can lead to misleading - % retrospective evaluations. - Our analysis shows that consistent but modest gains in predictive accuracy - are obtained by using these indicators, and furthermore, these gains are - related to periods in which the auxiliary indicators behave as ``leading - indicators'' of case rates. + Short-term forecasts of traditional streams from public health reporting (such + as cases, hospitalizations, and deaths) are a key input to public health + decision-making during a pandemic. 
Since early 2020, our research group has + worked with data partners to collect, curate, and make publicly available + numerous real-time COVID-19 indicators, providing multiple views of pandemic + activity in the U.S. This paper studies the utility of five such + indicators---derived from de-identified medical insurance claims, + self-reported symptoms from online surveys, and COVID-related Google search + activity---from a forecasting perspective. For each indicator, we ask whether + its inclusion in an autoregressive (AR) model leads to improved predictive + accuracy relative to the same model excluding it. Such an AR model, without + external features, is already competitive with many top COVID-19 forecasting + models in use today. Our analysis reveals that (a) inclusion of each of these + five indicators improves on the overall predictive accuracy of the AR model; + (b) predictive gains are in general most pronounced during times in which + COVID cases are trending in ``flat'' or ``down'' directions; (c) one + indicator, based on Google searches, seems to be particularly helpful + during ``up'' trends. \end{abstract} \dates{This manuscript was compiled on \today} @@ -114,78 +111,95 @@ \dropcap{T}racking and forecasting indicators from public health reporting streams---such as confirmed cases and deaths in the COVID-19 pandemic---is -crucial for understanding disease spread, formulating public policy responses, -and anticipating future public health resource needs. A companion paper -\cite{Reinhart:2021} describes our research group's (Delphi's) efforts in -curating and maintaining a database of real-time indicators that track COVID-19 -activity and other relevant phenomena. The signals (a term we use synonomously -with ``indicators'') in this database are accessible through the COVIDcast API -\cite{CovidcastAPI}, with associated R \cite{CovidcastR} and Python -\cite{CovidcastPy} packages for convenient data fetching and processing -tools. In the current paper, we aim to quantify the utility provided by a core -set of these indicators for two fundamental prediction tasks: probabilistic -forecasting of COVID-19 case rates and prediction of future COVID-19 case -hotspots (defined by the event that a relative increase in COVID-19 cases -exceeds a certain threshold). - - At the outset, we should be clear that our intent in this paper is \textit{not} +crucial for understanding disease spread, correctly formulating public policy +responses, and rationally planning future public health resource needs. A +companion paper \cite{Reinhart:2021} describes our research group's efforts, +beginning in April 2020, in curating and maintaining a database of real-time +indicators that track COVID-19 activity and other relevant phenomena. The +signals (a term we use synonymously with ``indicators'') in this database are +accessible through the COVIDcast API \cite{CovidcastAPI}, as well as associated +R \cite{CovidcastR} and Python \cite{CovidcastPy} packages, for convenient data +fetching and processing. In the current paper, we quantify the utility provided +by a core set of these indicators for two fundamental prediction tasks: +probabilistic forecasting of COVID-19 case rates and prediction of future +COVID-19 case hotspots (defined by the event that a relative increase in +COVID-19 cases exceeds a certain threshold). + +At the outset, we should be clear that our intent in this paper is \textit{not} to provide an authoritative take on cutting-edge COVID-19 forecasting methods. 
-Instead, our purpose here is to provide a rigorous, quantitative assessment of -the utility that several auxiliary indicators---such as those derived from -internet surveys or medical insurance claims---provide in tasks that involve -predicting future trends in confirmed COVID-19 cases. To assess such utility in -as simple terms as possible, we center our study in the framework of a basic -autoregressive model (in which COVID cases in the near future are predicted from -a linear combination of COVID cases in the near past), and ask whether the -inclusion of an auxiliary indicator as an additional feature in such a model -improves its predictions. - -While forecasting carries a rich literature that offers a wide range of -techniques, see, e.g., \cite{Hyndman:2018}, we purposely constrain ourselves to -very simple models, avoiding common enhancements such as order selection, -correction of outliers/anomalies in the data, and inclusion of regularization or -nonlinearities. For the same reason, we do not account for other factors that -may well aid in forecasting, such as age-specific effects, holiday adjustments, -public health mandates, and so on. That said, analyses of forecasts submitted -to the COVID-19 Forecast Hub \cite{ForecastHub} by a large community of modelers -have shown that simple, robust models have consistently been among the -best-performing over the pandemic, including time series models similar the ones -we use in this paper. See, e.g., \cite{Cramer:2021}, for a detailed study -of death forecasting. Our internal evaluations suggest that the same is true (at -least, if not to a greater extent) for case forecasting. +Similarly, some authors, e.g., \cite{ioannidis2020a}, have pointed out numerous +mishaps of forecasting during the pandemic, and it is not our general intent to +fix them here. Instead, we start with a basic yet reasonably effective +model for predicting future trends in COVID-19 cases, and present a +rigorous, quantitative assessment of the added value provided by auxiliary +indicators, which are derived from data sources that operate outside of +traditional public health streams. In particular, we consider five indicators +derived from de-identified medical insurance claims, self-reported symptoms from +online surveys, and COVID-related Google searches. + +To assess this value in as direct terms as possible, we base our study around a +very simple model: an autoregressive model, in which COVID cases in the +near future are predicted using a linear combination of COVID cases in the near +past. Forecasting carries a rich literature, offering a wide range of +sophisticated techniques (see, e.g., \cite{Hyndman:2018} for a review); however, +we purposely avoid enhancements such as order selection, correction of +outliers/anomalies in the data, and inclusion of regularization or +nonlinearities. Similarly, we do not account for other factors that may well aid +in forecasting, such as age-specific effects, holiday adjustments, and the +effects of public health mandates. All that said, despite its simplicity, the +basic autoregressive model that we consider in this paper exhibits competitive +performance (see the Supplementary Materials for details) with many of the top +COVID-19 case forecasters submitted to the U.S.\ COVID-19 Forecast Hub +\cite{ForecastHub}, which is the official source of forecasts used in public communications by the U.S. CDC. 
The strong performance of the autoregressive +model here is in line with the fact that simple, robust models have also +consistently been among the best-performing ones for COVID-19 death +forecasting \cite{Cramer:2021}. In the companion paper \cite{Reinhart:2021}, we analyze correlations between various indicators and COVID case rates. These correlations are natural summaries of the contemporaneous association between an indicator and COVID cases, but they fall short of delivering a satisfactory answer to the question that motivates the current article: is the information contained in an indicator -demonstrably useful for the prediction tasks we care about? For such a question, -lagged correlations (e.g., measuring the correlation between an indicator and -COVID case rates several days in the future) move us in the right direction, but -still do not move us all the way there. The question about \textit{utility for - prediction} is focused on a much higher standard than simply asking about -correlations; to be useful in forecast or hotspot models, an indicator must -provide relevant information that is not otherwise contained in past values of -the case rate series itself. We will assess this in the most direct way -possible: by inspecting the difference in predictive performance of simple -autoregressive models trained with and without access to past values of a -particular indicator. - -There is another, more subtle issue in evaluating predictive utility that -deserves explicit mention, as it will play a key role in our analysis. -Signals computed from surveillance streams will often be subject to -latency and/or revision. For example, a signal based on aggregated medical -insurance claims may be available after just a few days, but it can then be -substantially revised over the next several weeks as additional claims are -submitted and/or processed late. Correlations between such a signal and case -rates calculated ``after the fact'' (i.e., computed retrospectively, using the -finalized values of this signal) will not deliver an honest answer to the -question of whether this signal would have been useful in real time. Instead, -we build predictive models using only the data that would have been available -\textit{as of} the prediction date, and compare the ensuing predictions in terms -of accuracy. To do so, we leverage Delphi's \texttt{evalcast} R package -\cite{EvalcastR}, which plugs into the COVIDcast API's data versioning system, -and facilitates honest backtesting. +demonstrably useful for the prediction tasks we care about? Note that even +lagged correlations cannot deliver a complete answer. Demonstrating +\textit{utility for prediction} is a much higher standard than simply asking +about correlations; to be useful in forecast or hotspot models, an indicator +must provide relevant information that is not otherwise contained in past values +of the case rate series itself (cf.\ the pioneering work on Granger causality +\cite{Granger:1969, engle1987a}, as well as the further references given +below). We assess this directly by inspecting the difference in predictive +performance of simple autoregressive models trained with and without access to +past values of a particular indicator. 
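+
+To fix ideas, this comparison can be sketched in R---only as a rough
+illustration, not the exact pipeline used in the paper (which is built on
+vintage data)---as a pair of quantile regressions on lagged features. Here
+\texttt{train} and \texttt{test} denote hypothetical data frames holding the
+7-day-averaged case rate, one auxiliary indicator, and their lagged values:
+
+\begin{verbatim}
+library(quantreg)
+
+# Hypothetical columns: y_ahead (case rate a days ahead), y0, y7, y14
+# (case rate lagged 0, 7, 14 days), x0, x7, x14 (indicator, same lags).
+taus <- c(0.1, 0.25, 0.5, 0.75, 0.9)  # a subset of the quantile levels used
+
+ar  <- rq(y_ahead ~ y0 + y7 + y14, tau = taus, data = train)
+arx <- rq(y_ahead ~ y0 + y7 + y14 + x0 + x7 + x14, tau = taus, data = train)
+
+# Average pinball (quantile) loss over levels; proportional to WIS.
+pinball <- function(y, qhat, taus) {
+  mean(sapply(seq_along(taus), function(k) {
+    e <- y - qhat[, k]
+    mean(pmax(taus[k] * e, (taus[k] - 1) * e))
+  }))
+}
+
+pinball(test$y_ahead, predict(ar,  newdata = test), taus) -
+  pinball(test$y_ahead, predict(arx, newdata = test), taus)
+# A positive difference favors the model that includes the indicator.
+\end{verbatim}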
+
+We find that each of the five indicators we consider---three based on
+COVID-related outpatient visits from medical insurance claims, one on
+self-reported symptoms from online surveys, and one on Google searches for
+anosmia or ageusia---provides an overall improvement in accuracy when
+incorporated into the autoregressive model. This is true for both COVID-19 case
+forecasting and hotspot prediction. Further analysis reveals that the gains in
+accuracy depend on the pandemic's dynamics at prediction time: the biggest
+gains in accuracy appear during times in which cases are ``flat'' or trending
+``down''; but the indicator based on Google searches offers its most notable
+improvement when cases are trending ``up''.
+
+Careful handling of data revisions plays a key role in our analysis. Signals
+computed from surveillance streams are often subject to latency and/or
+revision. For example, a signal based on aggregated medical insurance claims may
+be available after just a few days, but it can then be substantially revised
+over the next several weeks as additional claims are submitted and/or processed
+late. Correlations between such a signal and case rates calculated ``after the
+fact'' (i.e., computed retrospectively, using the finalized values of this
+signal) will not deliver an honest answer to the question of whether this signal
+would have been useful in real time. Instead, we build predictive models using
+only the data that would have been available \textit{as of} the prediction date,
+and compare the ensuing predictions in terms of accuracy. The necessity of
+real-time data for honest forecast evaluations has long been recognized in
+econometrics \cite{howrey1978a, harvey1983a, mankiw1986a,
+  trivellato1986a, diebold1991a, patterson1995a, koenig2003a, croushore2006a,
+  clark2009a, faust2009a, croushore2011a}, but it is often overlooked in
+epidemic forecasting despite its critical importance; see, e.g.,
+\cite{chakraborty2018know}.

Finally, it is worth noting that examining the importance of additional features
for prediction is a core question in inferential statistics and econometrics,
@@ -194,10 +208,10 @@
an active field of research from both the applied and theoretical angles; see,
e.g., \cite{Diebold:2002, McCraken:2007, Diebold:2015, Stokes:2017, Lei:2018,
  Rinaldo:2019, Williamsom:2020, Zhang:2020, Dai:2021, Fryer:2021}.
-Our take on this problem is in line with much of this literature; however, in
-order to avoid making any explicit assumptions, we do not attempt to make formal
-significance statements, and instead, broadly examine the stability of our
-conclusions with respect to numerous modes of analysis.
+Our take in the current work is in line with much of this literature; however,
+in order to avoid making any explicit assumptions, we do not attempt to make
+formal significance statements, and instead, broadly examine the stability of
+our conclusions with respect to numerous modes of analysis.

\section{Methods}

@@ -230,10 +244,10 @@ \subsection{Signals and Locations}
Google search volume for queries that relate to anosmia or ageusia (loss of
smell or taste), based on Google's COVID-19 Search Trends data set.
\end{itemize}
-In short, we choose these indicators because, conceptually speaking, they
-measure aspects of an individual's disease progression that would plausibly
-precede the occurence of (at worst, co-occur with) the report of a positive
-COVID-19 test, through standard public health reporting streams.
+We choose these indicators because, conceptually speaking, they measure aspects
+of an individual's disease progression that would plausibly precede the
+occurrence of (at worst, co-occur with) the report of a positive COVID-19 test,
+through standard public health reporting streams.

For more details on the five indicators (including how these are precisely
computed from the underlying data streams) we refer to
@@ -255,7 +269,7 @@ \subsection{Signals and Locations}
(cardiovascular or neurological) are performed. The smallest HRR has a
population of about 125,000. While some are quite large (such as the one
containing Los Angeles, which has more than 10 million people), generally HRRs
-are much more homogenous in size than the (approximately) 3200 counties,
+are much more homogeneous in size than the (approximately) 3200 U.S. counties,
and they serve as a nice middle ground in between counties and states.

HRRs, by their definition, would be most relevant for forecasting hospital
@@ -317,12 +331,12 @@ \subsection{Vintage Training Data}

Lastly, our treatment of the Google-AA signal is different from the rest.
Because Google's team did not start publishing this signal until early
-September, 2020, we do not have true vintage data before then. Furthermore, the
-latency of the signal was always at least one week through 2020. However,
-unlike the claims-based signals, there is no reason for revisions to occur after
-initial publication, and furthermore, the latency of the signal is not an
-unavoidable property of the data type, so we simply use finalized signal values,
-with zero latency, in our analysis.
+September 2020, we do not have true vintage data before then; the latency of
+the signal was always at least one week through 2020. However, unlike the
+claims-based signals, there is no reason for revisions to occur after initial
+publication, and furthermore, the latency of the signal is not an unavoidable
+property of the data type, so we simply use finalized signal values, with zero
+latency, in our analysis.

\subsection{Analysis Tasks}

@@ -359,9 +373,9 @@ \subsection{Analysis Tasks}
$Y_{\ell,t+a-7}$), again for each $a=7,\ldots,21$.

Why do we define the response variables via 7-day averaging? The short answer
-is robustness: averaging stabilizes the case time series, and accounts for
-uninteresting artifacts like day-of-the-week effects in the series. Note that
-we can also equivalently view this (equivalent up to a constant factor) as
+is robustness: averaging stabilizes the case time series, and moderates
+uninteresting artifacts like day-of-the-week effects in the data. We note
+that we can equivalently view this (up to a constant factor) as
predicting the HRR-level case incidence rate \textit{summed} over some 7-day
period in the future, and predicting a binary variable derived from this.

@@ -387,10 +401,7 @@ \subsection{Analysis Tasks}
of signals for features, as we will detail shortly) June 9, 2020 for
forecasting, and June 16, 2020 for hotspot prediction. (The bottleneck here is
the CTIS-CLI-in-community signal, which does not exist before early April 2020,
-when the survey was first launched). The end date was chosen again with a
-consideration to align both tasks as best as possible, and because very few
-hotspots exist for the several few months of 2021, due to the general and
-gradual decline of the pandemic during this period.
+when the survey was first launched.)
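+
+To make the preceding target definitions concrete, a minimal sketch of their
+construction in R is given below, using hypothetical data frame and column
+names and an illustrative hotspot threshold (the precise definitions we use
+are those given in Table \ref{tab:analysis_tasks}):
+
+\begin{verbatim}
+library(dplyr)
+library(zoo)
+
+# `daily`: one row per (geo, date), with a daily case incidence rate per
+# 100,000 people in column `case_rate` (hypothetical names).
+targets <- daily %>%
+  group_by(geo) %>%
+  arrange(date, .by_group = TRUE) %>%
+  mutate(
+    y       = zoo::rollmeanr(case_rate, k = 7, fill = NA),  # 7-day average
+    rel_chg = (y - lag(y, 7)) / lag(y, 7),  # change vs. the preceding week
+    hotspot = as.integer(rel_chg > 0.25)    # illustrative threshold only
+  ) %>%
+  ungroup()
+\end{verbatim}
+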
\paragraph{Forecasting Models} @@ -436,7 +447,7 @@ \subsection{Analysis Tasks} At prediction time, in order to avoid crossing violations (that is, for two levels $\tau' > \tau$, the predicted quantile at level $\tau$ exceeds the -predicted quantile at level $\tau'$), we apply a simple post-hoc sorting. See +predicted quantile at level $\tau'$), we apply a simple post hoc sorting. See Figure~\ref{fig:trajectory} for an example forecast. \begin{figure}[t] @@ -489,7 +500,7 @@ \subsection{Analysis Tasks} An important detail is that in hotspot prediction we remove all data from training and evaluation where, on average, fewer than 30 cases (this refers to a -count, not a rate) are observed over the preceding 7 days. This avoids having to +count, not a rate) are observed over the prior 7 days. This avoids having to make arbitrary calls for a hotspot (or lack thereof) based on small counts. \subsection{Evaluation Metrics} @@ -499,7 +510,7 @@ \subsection{Evaluation Metrics} \textit{weighted interval score} (WIS), a quantile-based scoring rule; see, e.g., \cite{Gneiting:2007}. WIS is a proper score, which means that its expectation is minimized by the population quantiles of the target variable. -The use of WIS in COVID-19 forecast scoring is discussed in \cite{Bracher:2021}; +The use of WIS in COVID-19 forecast scoring was proposed by \cite{Bracher:2021}; WIS is also the main evaluation metric used in the COVID-19 Forecast Hub. More broadly, the specific form of WIS used here is a standard metric in the forecasting community for evaluating quantile-based probabilistic forecasts, @@ -536,8 +547,8 @@ \subsection{Evaluation Metrics} (first term in each summand) and an ``under/overprediction'' component (second term in each summand). But the second form given in \eqref{eq:wis_quantiles} is especially noteworthy in our current study because it reveals WIS to be the same -as the quantile regression loss that we use to train our forecasting models -(i.e., we fit by optimizing WIS averaged over the training data). +as the quantile regression loss that we use to train our forecasting models. +%(i.e., we fit by optimizing WIS averaged over the training data). For hotspot prediction, we evaluate the probabilistic classifiers produced by the logistic models in \eqref{eq:hotspot_ar} and \eqref{eq:hotspot_ar_x} using @@ -584,15 +595,15 @@ \subsection{Other Considerations} Some signals have a significant amount of spatial heterogeneity, by which we mean their values across different geographic locations are not comparable. This is the case for the Google-AA signal (due to the way in which the -underlying search trends time series is self-normalized, see +Google search trends time series is self-normalized, see \cite{GoogleSymptoms}) and the claims-based signals (due to market-share differences, and/or differences in health-seeking behavior). Such spatial heterogeneity likely hurts the performance of the predictive models that rely on these signals, because we train the models on data pooled over all locations. -In the current paper, we do not attempt to address this issue (it is again a -topic of ongoing work in our group), and we simply note that location-specific -effects (or pre-processing to remove spatial bias) would likely improve the -performance of the models involving Google-AA and the claims-based indicators. 
+In the current paper, we do not attempt to address this issue, and we simply
+note that location-specific effects (or pre-processing to remove spatial bias)
+would likely improve the performance of the models involving Google-AA and the
+claims-based indicators.

\section{Results}

@@ -603,7 +614,7 @@ \section{Results}
indicator---namely ``CHNG-CLI'', ``CHNG-COVID'', ``CTIS-CLI-in-community'',
``DV-CLI'', or ``Google-AA''---interchangeably with the model in forecasting,
\eqref{eq:forecast_ar_x}, or hotspot prediction, \eqref{eq:hotspot_ar_x}, that
-uses this particular indicator as a feature (the meaning should be clear from
+uses this same indicator as a feature (the meaning should be clear from
the context). So, for example, the CHNG-CLI model in forecasting is the one in
\eqref{eq:forecast_ar_x} that sets $X_{\ell,t}$ to be the value of the CHNG-CLI
indicator at location $\ell$ and time $t$. Finally, we use the term ``indicator
@@ -611,7 +622,7 @@ \section{Results}
\eqref{eq:forecast_ar_x} or \eqref{eq:hotspot_ar_x} (five from each of the
forecasting and hotspot prediction tasks).

-We begin with a summary of the high-level conclusions.
+Below is a summary of the high-level conclusions.

\begin{itemize}
\item Stratifying predictions by the ahead value ($a=7,\ldots,21$), and
@@ -623,7 +634,7 @@ \section{Results}

\item In the same aggregate view, CHNG-COVID and DV-CLI offer the biggest
  gains in both forecasting and hotspot prediction. CHNG-CLI is
-  inconsistent: it provides a big gain in hotspot prediction, but little gain
+  inconsistent: it provides a large gain in hotspot prediction, but little gain
  in forecasting (it seems to be hurt by a notable lack of robustness, due to
  backfill). CTIS-CLI-in-community and Google-AA each provide decent gains
  in forecasting and hotspot prediction. The former's performance in
@@ -635,18 +646,18 @@ \section{Results}
  decreasing (most notable in CHNG-COVID and CTIS-CLI-in-community), but can be
  worse than AR when case rates are increasing (this is most notable in CHNG-CLI
  and DV-CLI). More rarely does an indicator model tend to beat AR when case
-  rates are increasing, but there appears to be some evidence of this for the
-  Google-AA model.
+  rates are increasing, but there appears to be evidence of this for the
+  Google-AA model.

% In an increasing period, when an indicator model is worse than AR, it tends
% to underpredict relative to AR (its median forecast is lower). In a flat
% or decreasing period, when an indicator model is better than AR, AR tends
% to overpredict relative to it (the median AR forecast is higher).

-\item In this same analysis, when an indicator model performs better than AR in
-  a decreasing period, this tends to co-occur with instances in which the
+\item In this same analysis, when an indicator model performs better than AR
+  in a decreasing period, this tends to co-occur with instances in which the
  indicator ``leads'' case rates (meaning, roughly, on a short-time scale in a
  given location, its behavior mimics that of future case rates). On
-  the other hand, if an an indicator model does better in periods of
+  the other hand, if an indicator model does better in periods of
  increase, or worse in periods of increase or decrease, its performance is
  not as related to leadingness.
\end{itemize}
@@ -668,8 +679,7 @@ \section{Results}
examine two assumption-lean methods for assessing the statistical significance
of our main results.
-Code to reproduce all results (which uses the \texttt{evalcast} R package) can
-be found at
+Code to reproduce all results can be found at
\url{https://github.com/cmu-delphi/covidcast-pnas/tree/main/forecast/code}.

\subsection{Aggregate Results by Ahead Value}

@@ -730,29 +740,29 @@ \subsection{Aggregate Results by Ahead Value}

\subsection{Implicit Regularization Hypothesis}

-One might ask if the benefits we observe in forecasting and hotspot
-prediction have anything to do with the actual indicator themselves. A plausible
-alternative explanation is that the indicators are simply providing \textit{implicit
-  regularization} on top of the basic AR model, in the same way any noise
-variable might, when we include them as lagged features in
-\eqref{eq:forecast_ar_x} and \eqref{eq:hotspot_ar_x}.
+One might ask if the benefits observed in forecasting and hotspot
+prediction have anything to do with the actual auxiliary indicators
+themselves. A plausible alternative explanation is that the indicators are
+just providing \textit{implicit regularization} on top of the basic AR model,
+in the same way any noise variable might, if we were to use it to create lagged
+features in \eqref{eq:forecast_ar_x} and \eqref{eq:hotspot_ar_x}.

To test this hypothesis, we reran all of the prediction experiments but with
-$X_{\ell,t}$ in the each indicator models replaced with suitable random noise
+$X_{\ell,t}$ in each indicator model replaced by suitable random noise
(bootstrap samples from a signal's history). The results, shown and explained
-more precisely in the Supplementary Materials, are vastly different (worse) than
-the original set of results. In both forecasting and hotspot prediction, the
-``fake'' indicator models offered essentially no improvement over the pure AR
-model, which strongly rejects (informally speaking) the implicit regularization
-hypothesis.
+more precisely in the Supplement, are vastly different (i.e., worse) than the
+original set of results. In both forecasting and hotspot prediction, the
+``fake'' indicator models offer essentially no improvement over the pure AR
+model, which---informally speaking---strongly rejects the implicit
+regularization hypothesis.

On the topic of regularization, it is also worth noting that the use of
-$\ell_1$ regularization (tuned by cross-validation) in fitting any of the models
-in \eqref{eq:forecast_ar}, \eqref{eq:forecast_ar_x}, \eqref{eq:hotspot_ar}, or
-\eqref{eq:hotspot_ar_x} did not generally improve their performance
-(experiments not shown). This is likely due to the fact that the number of
-training samples is large compared to the number of features (6,426 training
-samples and only 3--6 features).
+$\ell_1$ regularization (tuned using cross-validation) in fitting any of the
+models in \eqref{eq:forecast_ar}, \eqref{eq:forecast_ar_x},
+\eqref{eq:hotspot_ar}, or \eqref{eq:hotspot_ar_x} did not generally improve
+their performance (experiments not shown). This is likely due to the fact that
+the number of training samples is large compared to the number of features
+(6,426 training samples and only 3--6 features).

% DJM: and that proper CV requires longer training windows, exacerbating the
% nonstationarity, a scenario in which its behavior is poorly understood.
@@ -783,21 +793,20 @@ \subsection{Evaluation in Up, Down, and Flat Periods}
whether the target occurs during a period of increasing case rates (up),
decreasing case rates (down), or flat case rates (flat).
To define the increasing period, we use the same definition we used for the hotspot task in -Table \ref{tab:analysis_tasks}. Therefore all hotspots are ``up'', while all -non-hotspots are either ``flat'' or ``down''. For the ``down'' scenario, we -simply use the opposite of the hotspot definition: $Y_{\ell,t}$ decreases by -more than 20\% relative to the preceding week. - -While the performance of all forecasters, including AR, will generally degrade -in up periods, different models exhibit different and interesting patterns. -CHNG-CLI, CHNG-COVID, Google-AA, and especially CTIS-CLI-in-community have -large right tails (displaying improvements over AR) during the down periods. -Google-AA and CTIS-CLI-in-community have large right tails during the flat -periods. CHNG-CLI and DV-CLI have large left tails (poor forecasts relative to -AR) in flat and up periods. Google-AA is the only model that outperforms -the AR model, on average, over up periods. Overall, the indicators seem to help -more during flat or down periods than up periods, with the exception of -Google-AA. +Table \ref{tab:analysis_tasks}. Therefore, all hotspots are labeled ``up'', +while all non-hotspots are either ``flat'' or ``down''. For the ``down'' +scenario, we simply use the opposite of the hotspot definition: $Y_{\ell,t}$ +decreases by more than 20\% relative to the preceding week. + +While the performance of all models, including AR, generally degrades in up +periods, different models exhibit different and interesting patterns. CHNG-CLI, +CHNG-COVID, Google-AA, and especially CTIS-CLI-in-community have large right +tails (showing improvements over AR) during the down periods. Google-AA and +CTIS-CLI-in-community have large right tails during the flat periods. CHNG-CLI +and DV-CLI have large left tails (poor forecasts relative to AR) in flat and up +periods. Google-AA is the only model that outperforms the AR model, on average, +in up periods. Overall, the indicators seem to help more during flat or down +periods than during up periods, with the exception of Google-AA. The Supplement pursues this analysis further. For example, we examine classification accuracy and log-likelihood for the hotspot task and find a @@ -807,6 +816,7 @@ \subsection{Evaluation in Up, Down, and Flat Periods} performance. \subsection{Effects of Leading or Lagging Behavior} +\label{sec:leading-lagging} \begin{figure}[t] \includegraphics[width=\columnwidth]{fig/leading-only-1.pdf} @@ -827,19 +837,24 @@ \subsection{Effects of Leading or Lagging Behavior} pronounced in down or flat periods than in up periods, a plausible explanation for the decreased performance in up periods noted above. -In the supplement, we show how to define a quantitative score to measure the -leadingness of an indicator, at any time $t$ and any location $\ell$, based on +In the supplement, we define a quantitative score to measure the +leadingness of an indicator, at any time $t$ and any location $\ell$, based on cross correlations to case rates over a short time window around $t$. -The higher this score, the greater it ``leads'' case activity. Figure -\ref{fig:leading} displays correlations between the leadingness score of an -indicator and the WIS difference (AR model minus an indicator model), stratified -by whether the target is classified as up, down, or flat. One would naturally -expect that the WIS difference would be positively correlated with leadingness. -Somewhat surprisingly, this relationship turns out to be strongest in down -periods and weakest in up periods. 
In fact, it is very nearly the case that for -each indicator, the strength of correlations only decreases as we move from down -to flat to up periods. In the supplement, we extend this analysis by studying -analogous ``laggingness'' scores, but we do not find as clear patterns. +The higher this score, the greater it ``leads'' case activity. This analysis is +closely related to Granger causality \cite{engle1987a} and draws on a large +body of prior work that measures leadingness in economic time-series, e.g., +\cite{mitchell1938a, koopmans1947a, shiskin1968a, granger1969a, yeats1972a, + sargent1977a, auerbach1982a, koch1988a, StockWatson1989, diebold1989a, + lahiri1991a, emerson1996a, hamilton1996a}. Figure \ref{fig:leading} displays +correlations between the leadingness score of an indicator and the WIS +difference (AR model minus an indicator model), stratified by whether the target +is classified as up, down, or flat. One would naturally expect that the WIS +difference would be positively correlated with leadingness. Somewhat +surprisingly, this relationship turns out to be strongest in down periods and +weakest in up periods. In fact, it is very nearly the case that for each +indicator, the strength of correlations only decreases as we move from down to +flat to up periods. In the supplement, we extend this analysis by studying +analogous ``laggingness'' scores, but we do not find as clear patterns. \section{Discussion} @@ -848,15 +863,14 @@ \section{Discussion} COVIDcast API (defined using from medical insurance claims, internet-based surveys, and internet search trends) is undoubtedly ``yes''. However, there are levels of nuance to such an answer that must be explained. None of the -indicators that we have investigated appear to be the ``silver bullet'' that one -might have hoped for, revolutionizing the tractability of the prediction -problem, rendering it easy when it was once hard (in the absence of auxiliary -information). Rather, the gains in accuracy from the indicator models (over an +indicators that we have investigated are transformative, rendering the +prediction problem easy when it was once hard (in the absence of auxiliary +information). Rather, the gains in accuracy from the indicator models (over an autoregressive model based only on past case rates) appear to be nontrivial, and consistent across modes of analysis, but modest. In forecasting, the indicator models are found to be most useful in periods in which case rates are flat or trending down, rather than periods in which case rates are trending up (as one -might hope to see is the benefit provided by a hypothetical ``leading +might hope to see is the benefit being provided by a hypothetical ``leading indicator''). As described previously, it is likely that we could improve the indicator models @@ -869,32 +883,42 @@ \section{Discussion} single model, allowing for interaction terms, and leveraging HRR demographics or mobility patterns. That said, we are doubtful that more sophisticated modeling techniques would change the ``topline'' conclusion---that auxiliary indicators -can provide nontrivial, consistent, but modest gains in forecasting and hotspot -prediction. Whether a more sophisticated model would be able to leverage the -indicators in such a way as to change some of the finer conclusions (e.g., by -offering clear improvements in periods in which cases are trending up) is less -clear to us. +can provide clear but modest gains in forecasting and hotspot prediction. 
+
+However, rigorously vetting more sophisticated models, and assessing the
+generalizability of our findings to different geographic resolutions, both
+remain important directions for future study. For example, in the
+Supplementary Materials we show that for forecasting at the state level, the
+benefits of including indicators in the AR model are generally less clear
+(compared to those observed at the HRR level). A plausible explanation: at the
+state level, where the signal-to-noise ratio (SNR) is higher, AR performs better
+overall and represents a higher standard (when asking whether it can be improved
+upon using the indicators). At the HRR level, where the SNR is lower, including
+the indicators as additional linear features in the AR model probably delivers a
+kind of variance reduction (just like averaging independent terms would), which
+contributes to improved accuracy. But as the SNR increases, this variance
+reduction becomes less important, and perhaps we must use more sophisticated
+modeling techniques to extract comparable value from the indicators.

We reiterate the importance of using vintage data for rigorous backtesting. Data
sources that are relevant to public health surveillance are often subject to
-revision, sometimes regularly (such as medical claims data) and sometimes
+revision, sometimes regularly (as in medical claims data) and sometimes
unpredictably (such as COVID-19 case reports). When analyzing models that are
designed to predict future events, if we train these models and make predictions
using finalized data, then we are missing a big part of the story. Not only will
our sense of accuracy be unrealistic, but certain models may degrade by a
-greater or lesser extent when they are forced to work with vintage data, so
-backtesting them with finalized data may lead us to make modeling decisions that
-are suboptimal for true test-time performance.
+greater or lesser extent when they are forced to reckon with vintage data, so
+backtesting on finalized data may lead us to make modeling decisions that are
+suboptimal for true test-time performance.

In this paper, we have chosen to consider only very simple forecasting models,
while devoting most of our effort to accounting for as much of the complexity of
-the underlying data and evaluation as possible. In fact, our paper is as much
-about demonstrating how one might address questions about model comparisons and
-evaluation in forecasting and hotspot prediction in general, as it is about
-providing rigorous answers to such questions in the context of COVID-19 case
-rates in particular. We hope that others will leverage our framework, and build
-on it, so that it can be used to guide work that advances the frontier of
-predictive modeling for epidemics and pandemics.
+the underlying data and evaluation as possible. In fact, our paper is not only
+about providing rigorous answers to questions about model comparisons in
+COVID-19 forecasting and hotspot prediction, but also about demonstrating how
+one might go about answering such questions in general. We hope that others will
+leverage our work, and build on it, to guide advances on the frontier of
+predictive modeling for epidemics and pandemics.
\acknow{We thank Matthew Biggerstaff, Logan Brooks, Johannes Bracher, Michael Johansson, Evan Ray, Nicholas Reich, and Roni Rosenfeld for several diff --git a/forecast/paper/response-to-reviewers.tex b/forecast/paper/response-to-reviewers.tex deleted file mode 100644 index 7f330f9..0000000 --- a/forecast/paper/response-to-reviewers.tex +++ /dev/null @@ -1,358 +0,0 @@ -\documentclass[11pt]{article} - -\usepackage[T1]{fontenc} -\usepackage{geometry} -\usepackage{microtype} - -\usepackage{newtxmath} -\usepackage{newtxtext} -\usepackage{graphicx} - -\usepackage{xcolor} -\definecolor{cobalt}{rgb}{0.0, 0.28, 0.67} - -\newenvironment{resp}{\begin{quote}\color{cobalt}}{\end{quote}} -\newcommand{\sresp}[1]{\textcolor{cobalt}{#1}} - -\setlength{\parindent}{0in} -\setlength{\parskip}{11pt} - -\title{Response to peer reviews} -\author{} -\date{\today} - -\begin{document} -\maketitle - - -\section*{Editor} - - - Reviews are generally supportive, but bring up several issues that need to be - addressed. Once addressed, please submit the revised manuscript with an - item-by-item reply/rebuttal. - - -\begin{resp} - We thank the editor and reviewers for their comments and the opportunity to make - revisions. We have made a number of revisions, described in detail below. - We believe these changes have strengthened the manuscript, and describe them in - more detail below. - We quote below the Reviewer comment and provide our response in this color. -\end{resp} - - -\section*{Reviewer 1} - - - - 1. In their presentation of of evaluation metrics they do not discuss - stability. For instance in [7] does the choice of the weights, using $\tau$ for - $x\geq 0$ rather than some monotone function say some power $\tau^\alpha$ make a - difference? Since as they indicate the measure is used for fitting as well as - ascertainment, some stability investigation seems appropriate. I suspect here - is no great effect but... - - -\begin{resp} - The reviewer is correct: the choice of $\tau$ rather than a monotone function of - $\tau$ will make a difference. But it seems that we should have been more clear - in our description of WIS. The weights used here are those used to evaluate - forecasts submitted to the COVID-19 Forecast Hub and the CDC. More broadly, - this is a standard metric in the forecasting community (just as Mean Squared - Forecast Error is standard), rather than an ``author decision''. With the - weights as used, it is equivalent to quantile loss and therefore is a discrete - version of CRPS (see [7]). - - In principle, - we would expect that different weights may alter the conclusions, though - likely not substantively. Applying a monotone function to $\tau$ only effectively - changes the quantile of interest, resulting in a mismatch between the coverage you - tried to get and the coverage WIS is evaluating. Applying the function to $\tau$ - and $(1-\tau)$ results in an asymmetry, penalizing forecasts that miss on one - side more than on the other. - - $\bullet$ We have added a sentence in the Evaluation Metrics section emphasizing that - this metric is standard. -\end{resp} - - - 2. They do not discuss the possible effects of vaccination rates and other - factors, though this may be one of the areas of localization they propose to - investigate. - - - -\begin{resp} - During the period discussed in the main paper, vaccinations had essentially no - effect. On Dec 31 2020, when the main evaluation period ends, according to - CDC data, about 1\% of the population - had received 1 dose. 
That said, we should probably be more specific in the - paragraph on lines 31--40: our goal is to use very simple models that do not - account for extra information (vaccinations but also strains of the virus, - ``super-spreader'' events, etc.). - - $\bullet$ We have added a sentence around lines 30--40 emphasizing the choice - to ignore such factors. -\end{resp} - - - 3. This points to what is perhaps a more major issue. Can they identify in - quantitative terms the consequences of the additional improvements in - forecasting provided by these methods in terms of gains in resources available - to deal with the epidemic? Unfortunately as they state, in the crucial periods - of sharply rising cases, the additional benefit seems least. - - -\begin{resp} - This is a good point, and we have made a few modifications described in more - detail below (Reviewer 2, Point 3). -\end{resp} - - - - 4. Despite the fact that a detailed description of terms in the text is - impossible, it would I think be helpful to say if there is any significance to - the names they associate with their first 2 and 4th indicators. - - -\begin{resp} - It seems that the Reviewer is referring to CHNG-CLI, CHNG-COVID and DV-CLI. These are - described in the Methods Section, Signals and Locations Subsection. To be more - specific, Change - Healthcare is a large healthcare insurance claims processor. The difference - between ``-CLI'' and ``-COVID'' is that the first measures insurance - claims involving symptoms - associated with COVID-like-illness while the second requires a medical - diagnosis (through - testing or presumed positive by the medical provider). The ``DV'' modifier - corresponds to different claims processors (not Change Healthcare) who wish to - remain anonymous. - -\end{resp} - - - - 5. What are Change Healthcare claims..as opposed to others? - - -\begin{resp} - Hopefully the discussion to point 4 above has cleared up any confusion. -\end{resp} - - -\section*{Reviewer 2} -\begin{quote} - \emph{(items numbered by the editor for convenience of reference)} -\end{quote} - -\subsection*{Statistical or Methodological Comments:} - -1. Is this really a stationary phenomenon? Over the timerange extending from -pre-lockdown to today, it certainly isn't. The method in this paper of -presenting model prediction errors makes it impossible to see this. People in -econometrics look at cumsums of normalized prediction errors. You can see change -points and trends very easily. - -\begin{resp} - There is little reason to believe that this data is stationary. The models - however are trained and evaluated over reasonably short periods (2--3 months), - over which time the behavior may well be stationary. - - $\bullet$ We have added a section - to the appendix that uses the cumulative sum normalized by baseline as - suggested. With some exceptions at the beginning of - the evaluation period, the relative ordering remains fairly consistent. The AR - model is easily the worst. - -\end{resp} - -2. Are the results in this paper statistically significant? I see no discussion -of this question, which seems unbelievable for me to be saying, given the -authorship. And yet here I am. - -\begin{resp} - As noted in the introduction, there are no options available for fully - rigorous and nonparametric (model-free) statistical significance testing, - giving the intricate spatiotemporal dependence in our forecasting problem. 
- - We also have strong reasons for not using model-based approaches (in part due - to nonstationarity, but also due to the belief that these models are not - applicable here). We felt that our extensive predictive comparisons in the - methods section and supplement were more appropriate. - - That said, to address this more directly we have made a number of additions: - - $\bullet$ We have added a section to the Supplement (and point - to it around - line 458 in the main text), that performs a Sign Test for differences in - accuracy. P-values there tend to cluster near zero and one suggesting some - periods have significantly improved accuracy for the indicator-assisted models - while in others, the reverse is true. - - $\bullet$ We also use a Diebold-Mariano test - for comparing forecast accuracy. The results for this test are mixed, as would - be expected given the sign test and - the nonstationarity the Reviewer mentions above. The indicators are - more helpful in some periods than others, see also the upswing/downswing - discussion in Section 2C. -\end{resp} - -3. Are the results in this paper practically significant? I see no discussion of -this question, which seems unbelievable for me to be saying, given that the -authors refuse to consider statistical significance. And yet here I am. - -\begin{resp} - See also Reviewer 1 point 3. And the general comment below (Point 8) where we - outline some specific changes we have made that bare on this point. - - Practically, on average, if our goal is to predict 11 days ahead, when the AR - model is at its best, CHNG-CLI - doesn’t help (at least with these simple models). However, CHNG-COVID, - DV-CLI, and Google-AA ``buy'' 4 extra days for the - same accuracy. CTIS-CLIIC ``buys'' about 5 extra days. What we mean here is - that if we have a certain tolerance for inaccuracy, adding these signals increases the - horizon at which we can predict within our tolerance. As for the - practical significance of 4-5 extra days, this would of course depend on the use case. However, cases grow - exponentially, so making public policy changes 15 or 16 days ahead rather than - 11 can mean real gains. There also seems to be a fairly constant mapping - between cases and future hospitalizations. Giving - hospitals a few extra days to accumulate supplies and find personnel is - meaningful. For predicting hotspots, we see similar results (gains of a few - days) though with different indicators. -\end{resp} - -4. For the cumsum plots I mentioned above, and other related techniques, we can -plot all the model prediction error cumsum curves versus time and thereby -compare different models for size, nonstationarity and significance of -prediction improvement. The plots that have been presented in Figure 4 say do -none of those tasks. - -\begin{resp} - We added the cumulative-sum plot as described above. Additionally, the - Reviewer's point aligns with much of our motivation for Fig 4. If we were - predicting a single location's - time series, then the cum-sum plot would provide all the information. - However, we have a separate time series prediction problem for each HRR, and - the upswings/downswings occur at different dates in different HRRs, so - conditioning on forecast date for the comparison (as is done in a - cumulative-sum plot) ends up adding together errors from different phases - of the pandemic. In Figure 4, we instead condition on phase of the pandemic, - which we think is more meaningful than calendar date. -\end{resp} - -5. 
The idea of using Asof data is of course important but I have known about it -for twenty years and practiced it routinely; it has apparently been in routine -use in econometrics for 40 years. - -\begin{resp} - We appreciate the Reviewer's perspective on the ubiquity of Asof data in - econometrics. But the epidemic modelling community is less aware. And it - has a real and meaningful importance for model selection going forward. We - hope that this paper will lead to greater appreciation of this issue in this - community. -\end{resp} - -\subsection*{Comments on Significance Statement:} - -6. I don't know what ``all provide a nontrivial boost in accuracy'', really -means. I guess that the general audience won't either. - -\begin{resp} - We have changed the clause to ``all provide a nontrivial improvement in - forecast accuracy'' -\end{resp} - -7. The sentence following is missing a word: depends ***on*** the pandemic... - -\begin{resp} - Fixed. Thank you for catching this. -\end{resp} - -\subsection*{General Comments: } - -8. In my understanding there are some standard models which have been relied -upon I guess millions of times during the pandemic. For example IHME's model. -The authors say a sentence or so about their choice of baseline, saying what -seems to me quite vague about the relation between their baseline and the -heavily used existing models. Wouldn't it be better to identify a specific -widely-used model explicitly as a baseline and then develop measures of -improvement over that specific widely used model? Also, might the authors -explain how the improvements observed by the authors stack up in terms of -concrete differences over the widely used models that people have been using. -Are there major misses that have taken place and could have been avoided? - - -\begin{resp} - We have tried to focus mainly on simple time series models in this paper - rather than comparing to other more elaborate models. A careful - (out-of-sample) comparison of forecasting performance for state-of-the-art - models can be found in reference 7 (Cramer et al.). That paper focuses on - forecasting deaths rather than cases and at the state-level rather than - disaggregated HRRs. However, their ranking (Figure 2 of Cramer et al.) puts the - ``COVIDhub-Baseline'' about 10th out of 27 models. As we discuss in the - manuscript, that baseline is - the - same as the baseline we use for forecasting in this paper: it propagates today - forward and resamples the residuals to create quantiles (we have expanded the - description on page 6 to be more explicit). All our ``simplistic'' models beat - this baseline (the AR(3) does as well). For comparison, IHME (along with 17 other - ``state-of-the-art'' models) does not. We would argue that using any of the - models in this paper would be preferable to relying on many of the - ``widely-used'' models if the goal is forecast accuracy. - - $\bullet$ We have added a sentence to the introduction (around line 40) that - describes the performance of simple models relative to widely-used models as - shown in Cramer et al. - - $\bullet$ We have expanded our description of our baseline model (around line - 457). In particular, we emphasize that it is the same as that used by the - COVID-19 Forecast Hub. - - $\bullet$ To directly address the Reviewer's point about IHME, we include two - figures here - that compare our forecasts with state-of-the-art forecasts - submitted to the COVID-19 Forecast Hub. 
The first overlays the - scores from the submissions on top of Figure 3 from the manuscript. We should - note that the submitted forecasts are at the state level, while ours are at - the HRR level, but once scaled by the baseline, should be roughly comparable. - (Note also, these are for cases, not deaths as described in the previous - bullet.) - Only those forecasters that submitted at least 300 forecasts (roughly 50 - states over 6 weeks during the 6 months examined) are shown. From this Figure - it is clear that all of the forecasters considered in this paper beat most - submissions. In the second figure, we use the Geometric Mean scaled by the - baseline as shown in Figure S6 in the Supplement. By this metric, the methods employed in this - paper would rank in the top 2 of all submitted forecasts, beating the - COVIDhub-ensemble at all forecast - horizons. The improvement of the best indicator assisted model over the AR model - is roughly the same as the improvement of the AR model over the - COVIDhub-ensemble. - - \begin{figure}[tb] - \centering - \includegraphics[width=.9\textwidth]{fig/compare-to-hub-mean-1.pdf} - \caption{This figure reproduces Figure 3 in the main paper but overlays - scores for the forecasts submitted to the COVID-19 Forecast Hub. Grey - lines correspond to the various teams that submitted during period our - evaluation period. We have highlighted the COVIDhub-ensemble, which is the - official forecast of the CDC.} - \label{fig:compare-to-hub-mean} - \end{figure} - - \begin{figure}[tb] - \centering - \includegraphics[width=.9\textwidth]{fig/compare-to-hub-geomean-1.pdf} - \caption{This figure is more like Figure S6 in the Supplement. In this case, - we add 1 to both the forecaster WIS and the baseline WIS before scaling - (to allow forecasters that achieve 0 error to appear), and we overlay - scores for the forecasts submitted to the COVID-19 Forecast Hub. Grey - lines correspond to the various teams that submitted during period our - evaluation period. We have highlighted the COVIDhub-ensemble, which is the - official forecast of the CDC.} - \label{fig:compare-to-hub-geomean} - \end{figure} -\end{resp} - - -\end{document} \ No newline at end of file diff --git a/forecast/paper/supplement-text.tex b/forecast/paper/supplement-text.tex index b12cb3b..e68e037 100644 --- a/forecast/paper/supplement-text.tex +++ b/forecast/paper/supplement-text.tex @@ -22,162 +22,246 @@ \section{Finalized Versus Vintage Data} \chngcli~(and, to a lesser extent, the other claims-based signals) is the most affected by this distinction, reflecting the latency in claims-based reporting. -This underscores the importance of efforts to provide ``nowcasts'' for claims +This highlights the importance of efforts to provide ``nowcasts'' for claims signals (which corresponds to a 0-ahead forecast of what the claims signal's value will be once all data has been collected). Looking at the \chngcli~and \dv~curves in Figure \ref{fig:fcast-finalized}, we can see that they perform very similarly when trained on the finalized data. This is reassuring because they are, in principle, measuring the same thing (namely, the percentage of -outpatient visits that are primarily about COVID-related symptoms). The -substantial difference in their curves in Figure 3 of the main paper must -therefore reflect their having very different backfill profiles. +outpatient visits that are primarily about COVID-related symptoms), but based on +data from different providers. 
The substantial difference in their curves in +Figure 3 of the main paper must, therefore, reflect their having very different +backfill profiles. While using finalized rather than vintage data affects \dv~the least for forecasting, it is one of the most affected methods for the hotspot problem. This is a reminder that the forecasting and hotspot problems are fundamentally -distinct. For example, the hotspot problem does not measure the ability to -distinguish between flat and downward trends. +different problems. For example, the hotspot problem does not measure the +ability to distinguish between flat and downward trends. Even the \ar~model is affected by this distinction, reflecting the fact that the case rates themselves (i.e., the response values) are also subject to revision. The forecasters based on indicators are thus affected both by revisions to the -indicators and by revisions to the case rates. In the case of the \gs~model, in -which we only used finalized values for the \gs~indicator, the difference in -performance can be wholly attributed to revisions of case rates. +indicators and by revisions to the case rates. And in the case of the +\gs~model, in which we only used finalized values for the \gs~indicator, the +difference in performance can be wholly attributed to revisions of case rates. -\section{Aggregation with Geometric Mean} +\section{Robust Aggregation} + +In this section, we consider using the geometric mean instead of the usual +(arithmetic) mean when aggregating the weighted interval score (WIS) across +location-time pairs. Aside from the geometric mean being generally more robust +to large values, there are two reasons why using it may be desirable. -In this section, we consider using the geometric mean instead of the arithmetic -mean when aggregating the weighted interval score (WIS) across location-time -pairs. There are three reasons why using the geometric mean may be desirable. \begin{enumerate} \item WIS is right-skewed, being bounded below by zero and having occasional very large values. Figure \ref{fig:wis-densities} illustrates that the - densities appear roughly log-Gaussian. The geometric mean is a natural choice - in such a context since the relative ordering of forecasters is determined by - the arithmetic mean of the {\em logarithm} of their WIS values. + densities appear roughly log-Gaussian. The geometric mean is a natural choice + in such a context since it can be viewed as a measure of centrality on the log + scale (it is the exponential of the arithmetic mean of log values). \item In the main paper, we report the ratio of the mean WIS of a forecaster to the mean WIS of the baseline forecaster. Another choice could be to take the mean of the ratio of WIS values for the two methods. This latter choice would penalize a method less for doing poorly where the baseline forecaster also - does poorly. Using instead the geometric mean makes the order of aggregation - and scaling immaterial since the ratio of geometric means is the same as the - geometric mean of ratios. -\item If one imagines that a forecaster's WIS is composed of multiplicative - space-time effects $S_{\ell,t}$ shared across all forecasters, - i.e., $\WIS(F_{\ell,t,f},Y_{\ell,t})=S_{\ell,t}E_{f,t}$ with $E_{f,t}$ a - forecaster-specific error, then taking the ratio of two forecasters' geometric - mean WIS values will effectively cancel these space-time effects. 
+
+  does poorly.\footnote{In a sense, this is implicitly estimating a
+    nonparametric space-time effect for forecaster error, and assuming that it has
+    a shared, multiplicative contribution to forecaster errors. That is, if one
+    imagines that a forecaster's WIS is composed of multiplicative space-time
+    effects $S_{\ell,t}$ shared across all forecasters,
+    \smash{$\WIS(F_{\ell,t,f},Y_{\ell,t}) = S_{\ell,t} E_{f,t}$} with
+    \smash{$E_{f,t}$} a forecaster-specific error, then taking the ratio of
+    individual WIS values cancels these space-time effects.} Using instead the
+  geometric mean makes the order of aggregation and scaling immaterial since the
+  ratio of geometric means is the same as the geometric mean of ratios.
\end{enumerate}

Figure \ref{fig:fcast-adjusted} uses the geometric mean for aggregation.
-Comparing this with Figure 3 of the main paper, we see that the main conclusions
+Comparing this with Figure 3 in the main paper, we see that the main conclusions
are largely unchanged; however, \chngcli~now appears better than \ar.  This
behavior would be expected if \chngcli's poor performance is attributable to a
relatively small number of large errors (as opposed to a large number of
moderate errors). Indeed, Figure 5 of the main paper further corroborates this,
-in which we see the heaviest left tails occuring for \chngcli.
+in which we see the heaviest left tails occur for \chngcli.
+
+\section{Comparing COVID-19 Forecast Hub Models}
+
+Since July of 2020, modelers have been submitting real-time forecasts of
+COVID-19 case incidence to the COVID-19 Forecast Hub \cite{ForecastHub}. This
+(along with forecasts of hospitalizations and deaths collected in the same Hub)
+serves as the source of the CDC's official communications on COVID forecasting.
+
+Our goal in this section is to compare the AR model and indicator models to
+those in the Hub, in terms of forecast errors aggregated over the same forecast
+tasks, to give a clear sense of how robust and effective the models we choose
+to investigate in the paper are relative to those in common operational use.
+This was prompted by a question from an anonymous reviewer of this paper, who
+asked why we chose to build our analysis of indicator utility around the AR
+model in the first place, and why we did not build it around others (say, the
+SIR model or more complex mechanistic models of disease transmission) that have
+occupied more of the spotlight over the course of the pandemic. The analysis
+presented here corroborates the claim that, while simple, the AR model, properly
+trained---using a quantile loss to directly estimate multiple conditional
+quantiles, a trailing training window of 21 days, pooling across all locations
+jointly, and fitting to case rates rather than counts (as we do in all our
+models in the main paper)---can be robust and effective, performing
+competitively with the top models submitted to the COVID-19 Forecast Hub,
+including the Hub's ensemble model.
+
+The closest forecast target in the Hub to that used in the main paper is
+state-level case incidence over an epiweek---defined by the sum of new case
+counts reported between a Sunday and the following Saturday (inclusive). Our
+forecast target, recall, is a 7-day trailing average of COVID-19 case incidence
+rates at the HRR level, which is different in three regards:
+
+\begin{enumerate}
+\item temporal resolution (daily versus weekly);
+\item geographic resolution (HRRs versus states);
+\item scale (rates versus counts).
+
+\end{enumerate}
+
+While the first and third of these differences could be easily addressed post
+hoc---meaning, we can always take our model's output and multiply it by
+7 in order to track the incidence over any given trailing week, and rescale it
+per location by population to bring it to the count scale---the second
+difference is not easy to adjust post hoc due to nonlinearity of the quantiles
+(a quantile of a linear combination of random variables is not simply the linear
+combination of their quantiles, but rather, depends intricately on the
+correlations between the random variables).
+
+Therefore, to make the comparison to models in the Hub as direct as possible, we
+retrained our models over the same forecast period as in the main paper, and
+with exactly the same general setup, except at the state rather than HRR level.
+We then rescaled them post hoc to account for the different temporal resolution
+and the rate-versus-count scaling (first and third points in the above list).
+The
+results are given in Figure \ref{fig:compare-to-hub}. The evaluation was
+carried out exactly as in the main paper, and the figure displays both mean WIS
+and geometric mean WIS, as a function of ahead, relative to the baseline model.
+Furthermore, to account for missingness (not all teams submitted forecasts to
+the Hub for all locations and ahead values for the entire period), we first
+dropped any forecaster that submitted for fewer than 6 weeks, and then restricted
+the aggregation metrics (mean or geometric mean) to errors from commonly
+available forecast tasks.
+
+By either metric, mean or geometric mean WIS relative to baseline, we can see
+in Figure \ref{fig:compare-to-hub} that the AR model examined in this paper is
+competitive with top models in the Hub, even outperforming the Hub ensemble
+model for smaller ahead values. The same general conclusion can be drawn for
+the indicator-assisted models as well. However, interestingly, a close inspection
+reveals that the AR model here is for the most part in the ``middle of the
+pack'' when compared to the indicator models, and only the Google-AA model
+offers clear improvement over AR for all aheads. This is likely due to the fact
+that at the state level, the signal-to-noise ratio (SNR) is generally higher,
+and the AR model provides a higher standard (on which to expect improvement using an
+auxiliary indicator), since it is able to extract a clearer signal from past
+lags of case rates. At the HRR level, with lower SNR, using indicators as
+simple additional linear features in the AR model probably leads to a variance
+reduction that is enough to boost accuracy, but at the state level, perhaps more
+sophisticated modeling techniques are needed to extract comparable value from
+some of the indicators.
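+
+For reference, the aggregation underlying Figure \ref{fig:compare-to-hub} can
+be summarized by the following R sketch (with hypothetical data frame and
+column names; a simplified sketch rather than the exact computation): per-task
+WIS values are restricted to forecast tasks scored for every forecaster, and
+then summarized by the mean and the geometric mean, each relative to the
+baseline forecaster.
+
+\begin{verbatim}
+library(dplyr)
+
+# `scores`: one row per forecast task, with columns forecaster, geo, date,
+# ahead, wis (hypothetical names).
+n_forecasters <- n_distinct(scores$forecaster)
+common <- scores %>%
+  group_by(geo, date, ahead) %>%
+  filter(n_distinct(forecaster) == n_forecasters) %>%  # commonly scored tasks
+  ungroup()
+
+base <- common %>%
+  filter(forecaster == "Baseline") %>%
+  select(geo, date, ahead, base_wis = wis)
+
+common %>%
+  inner_join(base, by = c("geo", "date", "ahead")) %>%
+  group_by(forecaster, ahead) %>%
+  summarize(
+    mean_rel    = mean(wis) / mean(base_wis),
+    # geometric mean of ratios = ratio of geometric means; add a small
+    # constant inside log() if any scores are exactly zero
+    geomean_rel = exp(mean(log(wis)) - mean(log(base_wis))),
+    .groups = "drop"
+  )
+\end{verbatim}
+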
By ``relative WIS'' we mean
-scaled by the strawman as displayed in Figure 3. To mitigate the dependence
+In the introduction of the main manuscript, we gave some reasons for avoiding
+formal statements about statistical significance, preferring instead to
+examine the stability of our results in different contexts. There are strong
+reasons to avoid model-based significance tests because the necessary
+assumptions about stationarity, independence, and the model being true (or at
+least approximately true) are certainly violated. With those caveats in mind,
+we undertake two relatively assumption-lean investigations in this section. The
+first is a sign test for whether the difference between the AR model’s relative
+WIS and each other model’s relative WIS is centered at zero. (Relative WIS here
+means scaled by the WIS of the baseline model.) To mitigate the dependence
across time (which intuitively seems to matter more than that across space), we
computed these tests in a stratified way, where for each forecast date we run a
-sign test on the scaled errors between two models over all 440 counties. The
-results are plotted as histograms in Figure \ref{fig:sign-test}. In this case,
-we use the total relative WIS over all aheads, but the histograms are largely
-similar for individual target horizons. If there were no difference, we would
-expect to see a uniform distribution. However, for all indicator-assisted
-models, we see many more small p-values than would be expected if the null
-hypothesis (that the AR model is better) were true.
+sign test on the scaled errors between two models over all 306 HRRs. The
+results are plotted as histograms in Figure \ref{fig:sign-test}. For this
+figure, we use the total relative WIS over all aheads, but the histograms are
+largely similar for individual target horizons. If there were no difference, we
+would expect to see a uniform distribution. However, for each indicator model,
+we see many more small p-values than would be expected if the null hypothesis
+(that the AR model is better) were true.

Another relatively assumption-lean method of testing for differences in forecast
-accuracy is the Diebold-Mariano test \cite{Diebold:2002, Diebold:2015,
-Harvey:1997}. Essentially, the differences between forecast losses are assumed
-to have a constant mean and a covariance that depends on time. Under these
+accuracy is the Diebold-Mariano (DM) test \cite{Diebold:2002, Diebold:2015,
+Harvey:1997}. Essentially, the differences between forecast errors are assumed
+to have a constant mean and an autocovariance that depends only on the time
+lag (that is, covariance stationarity). Under these
conditions, the asymptotic distribution for the standardized mean of the
differences is limiting normal provided that a heteroskedasticity and
-autocorrelation robust estimate of the variance is used. Using the loss as
-weighted interval score across all HRRs and horizons (7 to 21 days ahead), we
-perform the DM test using both the mean relative to the strawman (as reported in
-the manuscript) and the geometric mean relative to the strawman as described
-above. The first two rows of Table \ref{tab:dm-test} displays p-values for the
-test that the indicator-assisted model is no-better than the AR model. Only the
-CHNG-CLI model's p-value exceeds conventional statistical significance
-thresholds.
+autocorrelation robust estimate of the variance is used. 
Using WIS as the error,
+across all HRRs and horizons (7 to 21 days ahead), we perform the DM test using
+both the mean relative to the baseline (as reported in the manuscript) and the
+geometric mean relative to the baseline as described above. The first two rows
+of Table \ref{tab:dm-test} display p-values for the test of the null hypothesis
+that each indicator model is no better than the AR model. In only a few
+instances---geometric mean and mean for the CHNG-CLI model, and mean for the
+CHNG-COVID model---do the p-values exceed conventional statistical significance
+thresholds.

\section{Bootstrap Results}

-As explained in Section 2.B. of the main paper, a (somewhat cynical) hypothesis
+As explained in Section 2.B of the main paper, a (somewhat cynical) hypothesis
for why we see benefits in forecasting and hotspot prediction is that the
indicators are not actually providing useful information but they are instead
acting as a sort of ``implicit regularization,'' leading to shrinkage on the
autoregressive coefficients and therefore to less volatile predictions. To
investigate this hypothesis, we consider fitting ``noise features'' that in
-truth should have zero coefficients. Recall (from the main paper) that at each
-forecast date, we train a model on 6,426 location-time pairs. Indicator models
-are based on six features, corresponding to the three autoregressive terms and
-the three lagged indicator values. To form noise indicator features, we replace
+truth have zero relationship to the response. Recall (from the main paper) that
+at each forecast date, we train a model on 6,426 location-time pairs. Each
+indicator model uses 6 features, corresponding to the 3 autoregressive terms and
+the 3 lagged indicator values. To form noise indicator features, we replace
their values with those from a randomly chosen time-space pair (while keeping
the autoregressive features fixed). In particular, at each location $\ell$ and
time $t$, for the forecasting task we replace the triplet $(X_{\ell,t},
-X_{\ell,t-7}, X_{\ell,t-14})$ in Eq. (3) of the main paper with the triplet
-$(X_{\ell^*,t^*}, X_{\ell^*,t^*-7}, X_{\ell^*,t^*-14})$, where $(\ell^*,t^*)$ is
-a location-time pair sampled with replacement from the 6,426 location-time
-pairs. Likewise in the hotspot prediction task, we replace the triplet
-$(X_{\ell,t}^\Delta, X_{\ell,t-7}^\Delta, X_{\ell,t-14}^\Delta)$ in Eq. (5) of
-the main paper with $(X_{\ell^*,t^*}^\Delta, X_{\ell^*,t^*-7}^\Delta,
-X_{\ell^*,t^*-14}^\Delta)$. Figures
+X_{\ell,t-7}, X_{\ell,t-14})$ in Eq.\ (3) of the main
+paper with the triplet \smash{$(X_{\ell^*,t^*}, X_{\ell^*,t^*-7},
+  X_{\ell^*,t^*-14})$}, where $(\ell^*,t^*)$ is a location-time pair sampled with
+replacement from the 6,426 total location-time pairs. Likewise in the hotspot
+prediction task, we replace the triplet \smash{$(X_{\ell,t}^\Delta,
+  X_{\ell,t-7}^\Delta, X_{\ell,t-14}^\Delta)$} in Eq.\ (5) of the main paper with
+\smash{$(X_{\ell^*,t^*}^\Delta, X_{\ell^*,t^*-7}^\Delta,
+  X_{\ell^*,t^*-14}^\Delta)$}. Figures
\ref{fig:fcast-booted}--\ref{fig:hot-booted} show the results. No method
exhibits a noticeable performance gain over the \ar~method, leading us to
-dismiss the implicit regularization hypothesis.
+dismiss the implicit regularization hypothesis. 

\section{Upswings and Downswings}

-In this section we provide extra details about the upswing / flat / downswing
-analysis described in the main text. Figure \ref{fig:upswing-summary} shows the
-overall results, examining the average difference $\WIS(AR) - \WIS(F)$ in
-period. 
Figure \ref{fig:hotspots-upswing-downswing} shows the same information
-for the hotspot task. On average, during downswings and flat periods, the
-indicator-assisted models have lower classification error and higher log
-likelihood than the AR model. For hotspots, both Google-AA and CTIS-CLIIC
-perform better than the AR model during upswings, in contrast to the forecasting
-task, where only Google-AA improves. For a related analysis, Figure
-\ref{fig:cor-wis-ratio} shows histograms of the Spearman correlation (Spearman's
-$\rho$, a rank-based measure of association) between the $\WIS(F)/\WIS(AR)$ and
-the magnitude of the swing. Again we see that case rate increases are positively
-related to diminished performance of the indicator models.
+In this section, we provide extra details about the upswing/downswing analysis
+described in the main manuscript, Section 2.C. Figure \ref{fig:upswing-summary}
+shows the overall results, examining the average difference $\WIS(\mathrm{AR}) -
+\WIS(F)$ for each forecaster $F$, in each period. Figure
+\ref{fig:hotspots-upswing-downswing} shows the same information for the hotspot
+task. On average, during downswings and flat periods, the indicator models have
+lower classification error and higher log likelihood than the AR model. For
+hotspots, both Google-AA and CTIS-CLIIC perform better than the AR model during
+upswings, in contrast to the forecasting task, where only Google-AA improves.
+In a related analysis, Figure \ref{fig:cor-wis-ratio} shows histograms of the
+Spearman correlation (Spearman's $\rho$, a rank-based measure of association)
+between the ratio $\WIS(F)/\WIS(\mathrm{AR})$ and the magnitude of the swing.
+Again we see that case rate increases are positively related to diminished
+performance of the indicator models.

One hypothesis for diminished relative performance during upswings is that the
-AR model tends to overpredict downswings and underpredict upswings. Adding
+AR model tends to overpredict downswings and underpredict upswings. Adding
indicators appears to help avoid this behavior on the downswing but not as much
-on upswings. Figure \ref{fig:upswing-corr-table} shows the correlation between
-between $\WIS(AR) - \WIS(F)$ and the difference of their median forecasts.
-During downswings, this correlation is large, implying that improved relative
-performance of $F$ is related to making lower forecasts than the AR model. The
-opposite is true during upswings. This is largely to be expected. However, the
-relationship attenuates in flat periods and during upswings. That is, when
-performance is better in those cases, it may be due to other factors than simply
-making predictions in the correct direction, for example, narrower confidence
-intervals.
-
-It is important to note, that even though some indicators, notably CHNG-COVID
-and CHNG-CLI underperform relative to the AR model during upswings, all models
-dramatically outperform the baseline in such periods. Figure
-\ref{fig:upswing-summary-remake} shows the performance of all forecasters relative
-to the null model. All forecasters suffer relative to the baseline during down
-periods, but the AR is the worst. In contrast, all models beat the baseline
-during up periods, even CHNG-COVID and CHNG-CLI, though not by quite as much as the
-AR does.
+on upswings. Figure \ref{fig:upswing-corr-table} shows the correlation
+between $\WIS(\mathrm{AR}) - \WIS(F)$ and the difference in their median
+forecasts. 
During downswings, this correlation is large, implying that improved
+relative performance of $F$ is related to making lower forecasts than the AR
+model. The opposite is true during upswings. This is largely to be expected.
+However, the relationship attenuates in flat periods and during upswings. That
+is, when performance is better in those cases, it may be due to other factors
+than simply making predictions in the correct direction, for example, narrower
+confidence intervals.
+
+It is important to note that, even though some indicators---notably CHNG-COVID
+and CHNG-CLI---underperform relative to the AR model during upswings, all models
+dramatically outperform the baseline in such periods. Furthermore, Figure
+\ref{fig:upswing-summary-remake} shows the performance of all forecasters
+relative to the baseline model. All forecasters suffer relative to the baseline
+during down periods, but the AR model is the worst. In contrast, all models beat
+the baseline during up periods, even CHNG-COVID and CHNG-CLI, though not by quite
+as much as the AR does.

\section{Leadingness and Laggingness}

@@ -185,26 +269,17 @@ \section{Leadingness and Laggingness}
are leading or lagging case rates during different periods. To define the amount
of leadingness or laggingness at location $\ell$, we use the cross correlation
function (CCF) between the two time series. The $\CCF_\ell(a)$ of an indicator
-$X_{\ell}$ and case rates $Y_{\ell}$ is defined as
-% \[
-% \CCF_{\ell,t}(a) = \frac{1}{n_a} \sum_{i=1}^{n_a} \left( ( X_{\ell,t,i+a} -
-% \overline{X}_{\ell,t}) / s^{X}_{\ell,t}\right) \left( ( Y_{\ell,t,i} -
-% \overline{Y}_{\ell,t}) / s^{Y}_{\ell,t}\right),
-% \]
-% where $n_a$ is the number of available time points when $X_{\ell,t}$ has been
-% shifted in time by $a$ days, and $\overline{X}_{\ell,t}$, $s^X_{\ell,t}$
-% are the sample mean and
-% standard deviation of $X_{\ell,t}$ (respectively $Y_{\ell,t}$). The result is a
-their Pearson correlation where $X_\ell$ has been aligned with the values of
-$Y_{\ell}$ that occurred $a$ days earlier. Thus, for any $a>0$,
-$\CCF_{\ell}(a)>0$ indicates that $Y_{\ell,t}$ is moving together with
-$X_{\ell,t+a}$. In this case we say that $X_{\ell}$ is lagging $Y_{\ell}$. For
-$a<0$, $\CCF_{\ell}(a)>0$ means that $Y_{\ell,t}$ is positively correlated with
-$X_{\ell,t-a}$, so we say that $X_{\ell}$ leads $Y_{\ell}$.
+series $X_{\ell}$ and case rate series $Y_{\ell}$ is defined as their Pearson
+correlation where $X_\ell$ has been aligned with the values of $Y_{\ell}$ that
+occurred $a$ days earlier. Thus, for any $a>0$, $\CCF_{\ell}(a)>0$ indicates
+that $Y_{\ell,t}$ is moving together with $X_{\ell,t+a}$. In this case we say
+that $X_{\ell}$ is lagging $Y_{\ell}$. For $a<0$, $\CCF_{\ell}(a)>0$ means that
+$Y_{\ell,t}$ is positively correlated with $X_{\ell,t+a}$ (a value of the
+indicator from $|a|$ days earlier), so we say that $X_{\ell}$ leads $Y_{\ell}$.

Figure \ref{fig:ccf-dv-finalized} shows the standardized signals for the HRR
containing Charlotte, North Carolina, from August 1, 2020 until the end of
-September. These are the same signals shown in Figure 1 in the manuscript but
+September. These are the same signals shown in Figure 1 in the manuscript, but
using finalized data. To define ``leadingness'' we compute $\CCF_{\ell}(a)$ (as
implemented with the R function {\tt ccf()}) for each $a\in \{-15,\ldots,15\}$
using the 56 days leading up to the target date. This is the same amount of data
@@ -221,12 +296,12 @@ \section{Leadingness and Laggingness}
28 in Charlotte, \dv~is leading cases leading but not lagging. 
Figure \ref{fig:lagging-only} shows the correlation between laggingness and the -difference in indicator WIS and AR WIS. Unlike leadingness (Figure 5 in the +difference in forecaster WIS and AR WIS. Unlike leadingness (Figure 5 in the manuscript) there is no obvious relationship that holds consistently across -indicators. This is heartening as laggingness should not aid forecasting +indicators. This is encouraging as laggingness should not aid forecasting performance. On the other hand, if an indicator is more lagging than it is leading, this may suggest diminished performance. -Figure~\ref{fig:diff-in-lead-lag} shows the correlation of the difference in +Figure \ref{fig:diff-in-lead-lag} shows the correlation of the difference in leadingness and laggingness with the difference in WIS. The pattern here is largely similar to the pattern in leadingness described in the manuscript: the relationship is strongest in down periods and weakest in up periods with the @@ -237,51 +312,35 @@ \section{Leadingness and Laggingness} the forecast. That is we are using the same data to evaluate predictive accuracy as to determine leadingness and laggingness. It should be noted that the leadingness of the indicator at the time the model is trained may also be -important. Thus, we could calculate separate leadingness and laggingness scores +important. Thus, we could calculate separate leadingness and laggingness scores for the trained model and for the evaluation data and examine their combination -in some way. We do not pursue this combination further and leave this -investigation for future work. - -% \section{Examining data in 2021} - -% In this section, we investigate the sensitivity of the results to the -% period over which we train and evaluate the models. In the main -% paper, we end all evaluation on December 31, 2020. Figures -% \ref{fig:fcast-alldates}--\ref{fig:hot-alldates} show how the -% results would differ if we extended this analysis through March -% 31, 2021. Comparing Figure \ref{fig:fcast-alldates} to Figure 3 of -% the main paper, one sees that as ahead increases most methods now improve -% relative to the baseline forecaster. When compared to other methods, -% \chngcli~appears much better than -% it had previously; however, all forecasters other than \chngcov~and -% \dv~are performing less well relative to the baseline than before. -% These changes are likely due to the differing nature of the pandemic -% in 2021, with flat and downward trends much more common than upward -% trajectories. Indeed, the nature of the hotspot prediction problem is -% quite different in this period. With a 21-day training window, it is -% common for there to be many fewer hotspots in training. - -\section{Aggregation Over Time and Space} +in some way. However, we do not pursue this further. + +\section{Disaggregation Over Time and Space} Following the suggestion of an anonymous reviewer, we investigate two other disaggregated versions of the main forecasting result shown in Figure 3 on the -manuscript. The first (Figure \ref{fig:cumulative-mean}) displays the cumulative -error (summed over all horizons) of each forecaster divided by the cumulative -error of the baseline. This perspective should illustrate how the models -perform over time, drawing attention to any nonstationary behavior. During the -initial increase in cases in July 2020, CTIS-CLIIC and Google-AA gain a lot -relative to the AR model. 
All the indicators do much better than the AR model
-during the following downturn (the ebb of the second wave). The AR model
-actually improves over the indicators in October 2020, before losing a bit in
-late November. The second (Figure \ref{fig:cumulative-geo-mean}) repeats this
-analysis but aggregating by Geometric mean as described above and displays a
-similar pattern.
-
-Figure \ref{fig:errs-in-space} examines the spatial behavior over the entire
-period of the indicator-assisted models relative to the AR. For ease of
-comparison, we show the percent improvement in each HRR. Negative numbers (blue)
-mean that the indicator helped while positives (red) mean that the indicator
-hurt forecasting performance. The clear pattern is that in most HRRs, the
-indicators improved performance, though usually by small amounts (2.5\%--10\%).
-In some isolated HRRs, performance was markedly worse, though there does not
-appear to be any particular pattern to these locations.
+manuscript. Below we use the term ``error'' to refer to the WIS summed over all
+ahead values. The first (Figure \ref{fig:cumulative-mean}) displays the
+cumulative error, up through any point in time, of each forecaster divided by
+the cumulative error of the baseline. This perspective should illustrate how
+the models perform over time, drawing attention to any nonstationary behavior.
+During the initial increase in cases in July 2020, CTIS-CLIIC and Google-AA gain
+quite a bit of accuracy relative to the AR model. All the indicators do much
+better than the AR model during the following downturn (the ebb of the second
+wave). The AR model actually improves over the indicators in October 2020,
+before losing a bit in late November. % The second (Figure
+% \ref{fig:cumulative-geo-mean}) repeats this analysis but aggregating by
+% geometric mean, and presents a similar message.
+
+Figure \ref{fig:errs-in-space} examines the spatial behavior of the errors of
+the indicator models relative to the AR model, over the entire period. For ease
+of comparison, we show the percent improvement in each HRR. Negative numbers
+(blue) mean that the indicator helped while positives (red) mean that the
+indicator hurt forecasting performance. The clear pattern is that in most HRRs,
+the indicators improved performance, though usually by small amounts
+(2.5\%--10\%). In some isolated HRRs, performance was markedly worse, though
+there does not appear to be any particular pattern to these locations.
+Interestingly, the geographic patterns of improvement differ quite a lot
+across the indicators. This suggests that a forecasting model that carefully
+combines all of the indicators could yield a considerable improvement.
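+
+To be concrete about what is mapped in Figure \ref{fig:errs-in-space}: writing,
+in a slight abuse of notation, $F_{\ell,t}$ and $\mathrm{AR}_{\ell,t}$ for the
+two forecasts at HRR $\ell$ and forecast date $t$, the plotted value for an
+indicator model $F$ is (up to implementation details) the percentage change in
+aggregate WIS relative to the AR model,
+\[
+100 \times \frac{\sum_{t} \WIS(F_{\ell,t}, Y_{\ell,t}) -
+\sum_{t} \WIS(\mathrm{AR}_{\ell,t}, Y_{\ell,t})}{\sum_{t}
+\WIS(\mathrm{AR}_{\ell,t}, Y_{\ell,t})},
+\]
+where the sums run over the forecast dates and ahead values in the evaluation
+period (the same value results whether one sums or averages the WIS within an
+HRR), so that negative values mark HRRs where the indicator model improved on
+the AR model.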
diff --git a/forecast/paper/supplement.Rmd b/forecast/paper/supplement.Rmd index 36f3c7d..fa620cf 100644 --- a/forecast/paper/supplement.Rmd +++ b/forecast/paper/supplement.Rmd @@ -9,13 +9,8 @@ params: flag_jumps: 15 --- - - - \input{supplement-text.tex} - - ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = FALSE, fig.width = 6.5, fig.height = 4.5, fig.align = "center", @@ -111,12 +106,8 @@ hrr_tab <- pull(hrr_names, hrrname) names(hrr_tab) <- pull(hrr_names, hrrnum) ``` - - - - ```{r fcast, include=FALSE} @@ -131,7 +122,7 @@ fig3 ```{r hot, include=FALSE} # Figure 4 in the manuscript fig4 <- plotter_hotspots(hotspots_honest) + - geom_hline(yintercept = 0.5) + geom_hline(yintercept = 0.5, linetype = "dashed") fig4 ``` @@ -143,9 +134,6 @@ comb1 <- plot_grid(fig3 + theme(legend.position = "none"), plot_grid(comb1, leg, ncol = 1, rel_heights = c(1,.1)) ``` - - - ```{r fcast-finalized, fig.cap="Forecasting performance using finalized data. Compare to Figure 3 in the manuscript."} plotter(fcasts_finalized, "wis", Mean, scaler = "strawman_wis", @@ -153,19 +141,16 @@ plotter(fcasts_finalized, ylab("Mean WIS (relative to baseline)") ``` - \clearpage - ```{r hot-finalized, fig.cap="Hotspot prediction performance using finalized data. Compare to Figure 4 in the manuscript."} plotter_hotspots(hotspots_finalized) + - geom_hline(yintercept = 0.5) + geom_hline(yintercept = 0.5, linetype = "dashed") ``` \clearpage - -```{r fcast-honest-v-finalized, fig.cap="Relative forecast WIS with vintage compared to finalized data. Using finalized data leads to overly optimistic performance."} +```{r fcast-honest-v-finalized, fig.cap="Forecast performance with vintage compared to finalized data. Using finalized data leads to overly optimistic performance."} plotter( left_join( fcasts_honest %>% select(forecaster:wis), @@ -174,14 +159,13 @@ plotter( ), "wis", Mean, scaler = "finalized_wis", order_of_operations = c("aggr","scale")) + - geom_hline(yintercept = 1) + + geom_hline(yintercept = 1, linetype = "dashed") + ylab("Mean WIS (vintage / finalized)") - ``` \clearpage -```{r hot-honest-v-finalized, fig.cap="Relative AUC with vintage compared to finalized data. Using finalized data leads to overly optimistic hotspot performance."} +```{r hot-honest-v-finalized, fig.cap="Hotspot prediction performance with vintage compared to finalized data. 
Using finalized data leads to overly optimistic performance."} left_join( hotspots_honest %>% group_by(forecaster, ahead) %>% @@ -195,27 +179,21 @@ left_join( theme_bw() + scale_color_manual(values = fcast_colors, guide = guide_legend(nrow=1)) + theme(legend.position = "bottom", legend.title = element_blank()) + - geom_hline(yintercept = 1) + + geom_hline(yintercept = 1, linetype = "dashed") + ylab("AUC (vintage / finalized)") ``` - - - \clearpage - - - -```{r wis-densities, fig.cap="Weighted interval score appears to more closely resemble a log-Gaussian distribution."} +```{r wis-densities, fig.cap="WIS values from forecast models, which appear to be roughly log-Gaussian."} ggplot(fcasts_honest, aes(wis, fill = forecaster)) + geom_density() + scale_x_log10() + theme_bw() + - xlab("WIS") + + xlab("WIS") + ylab("Density") + scale_fill_manual(values = fcast_colors) + facet_wrap(~forecaster) + theme(legend.position = "none") @@ -223,19 +201,26 @@ ggplot(fcasts_honest, \clearpage -```{r fcast-adjusted, fig.cap="Relative forecast performance using vintage data and summarizing with the more robust geometric mean."} +```{r fcast-adjusted, fig.cap="Forecast performance (using vintage data), summarized by geometric mean."} plotter(fcasts_honest, "wis", GeoMean, scaler = "strawman_wis", order_of_operations = c("scale","aggr")) + - ylab("Geometric mean of WIS (relative to baseline)") + ylab("Geometric mean WIS (relative to baseline)") ``` +\clearpage + + + +```{r compare-to-hub, fig.cap="Forecast performance for AR and indicator models, each retrained at the state level, compared to models submitted to the COVID-19 Forecast Hub over the same period. The thin grey lines are individual models from the Hub; the light blue line is the Hub ensemble model. To align prediction dates as best as possible, we look at the AR and indicator model forecasts for 5, 12, 19, and 26 days ahead; this roughly corresponds to 1, 2, 3, and 4 weeks ahead, respectively, since in the Hub, models typically submit forecasts on a Tuesday for the epiweeks aligned to end on each of the following 4 Saturdays."} +knitr::include_graphics("fig/compare-states-to-hub.pdf") +``` \clearpage -```{r sign-test, fig.cap="P-values for a sign test that the WIS of the AR forecaster is smaller than that of the indicator-assisted model. Each P-value corresponds a particular forecast date."} +```{r sign-test, fig.cap="P-values from a one-sided sign test for improved forecast performance of the indicator-assisted models. Each p-value corresponds to a forecast date. 
The alternative hypothesis is that the indicator model is better (the median difference between the relative WIS of the AR model and that of the indicator model is positive)."}
fcast_colors2 <- fcast_colors[names(fcast_colors) != "AR"]

st <- fcasts_honest %>%
@@ -266,19 +251,15 @@ ggplot(st, aes(p)) +
   scale_y_continuous(
     labels = scales::percent_format(scale = 1, accuracy = 1)) +
   theme_bw() +
-  theme(legend.position = "bottom")
-
+  theme(legend.position = "bottom", legend.title = element_blank())
```
-
```{r dm-test}
errs <- fcasts_honest %>%
  group_by(forecaster, forecast_date) %>%
  summarise(
-    `Mean Relative WIS` = mean(wis) / mean(strawman_wis),
-    # mwis = mean(wis),
-    `Geometric Mean Relative WIS` = GeoMean(wis / strawman_wis) #,
-    # gmwis = GeoMean(wis)
+    `Mean` = mean(wis) / mean(strawman_wis),
+    `Geometric mean` = GeoMean(wis) / GeoMean(strawman_wis)
  ) %>%
  ungroup()
ar <- errs %>%
@@ -287,22 +268,21 @@ ar <- errs %>%
   pivot_longer(-forecast_date, names_to = "metric", values_to = "AR")
 notar <- errs %>%
   filter(forecaster != "AR") %>%
-  pivot_longer(-c("forecaster", "forecast_date"), names_to = "metric", values_to = "err")
+  pivot_longer(-c("forecaster", "forecast_date"),
+               names_to = "metric",  # keep the same name as in `ar` so the join and group_by line up
+               values_to = "err")
 all_errs <- left_join(notar, ar)
 dms <- all_errs %>%
   group_by(forecaster, metric) %>%
   summarise(dm = forecast::dm.test(AR, err, "greater", h = 7 , power = 1)$p.value)
-
-
 kableExtra::kable(
   bind_rows(dms) %>% pivot_wider(names_from = forecaster, values_from = dm),
-  digits = 3, booktabs = TRUE, caption = "$P$-values for a one-sided Diebold-Mariano test for improvement in forecast error. The test is for the null hypothesis of equal performance against the alternative that the indicator assisted model is better.")
+  digits = 3, booktabs = TRUE, caption = "P-values from a one-sided Diebold-Mariano test for improved forecast performance when adding the indicators. The alternative hypothesis is that the indicator-assisted model is better than the AR model.")
```
-

```{r bootstrap-loading}


@@ -343,24 +323,18 @@ plotter(fcasts_booted,
 plotter(fcasts_booted,
         "wis", GeoMean, scaler = "strawman_wis",
         order_of_operations = c("scale","aggr")) +
-  ylab("Geometric mean of WIS (relative to baseline)")
+  ylab("Geometric mean WIS (relative to baseline)")
```

\clearpage

```{r hot-booted, fig.cap = "Hotspot prediction performance when indicators are replaced with samples from their empirical distribution. Performance is largely similar to the AR model."}
plotter_hotspots(hotspots_booted) +
-  geom_hline(yintercept = 0.5)
+  geom_hline(yintercept = 0.5, linetype = "dashed")
```

-
-
\clearpage
-
-
-
-

```{r up-down-processing}


@@ -410,7 +384,7 @@ up_down_df <- left_join(corr_df %>%
   inner_join(up_down, by = c("geo_value","target_end_date"))
```

-```{r upswing-summary, fig.cap="Average difference between the WIS of the AR model and the WIS of the other forecasters. The indicator-assisted forecasters do best during down and flat periods."}
+```{r upswing-summary, fig.cap="Average difference between the WIS of the AR model and of the indicator models, separated into up, down, and flat periods. 
The indicator models generally do best during down and flat periods."} up_down_df_summary <- up_down_df %>% group_by(forecaster, udf) %>% summarise(mean = Mean(AR_wis - wis)) @@ -418,17 +392,15 @@ up_down_df_summary %>% ggplot(aes(udf, mean , fill = forecaster)) + geom_bar(width = 0.6, position = position_dodge(width=0.6), stat = "identity") + - scale_fill_manual(values = fcast_colors[1:5], guide = guide_legend(nrow = 1)) + + scale_fill_manual(values = fcast_colors2, guide = guide_legend(nrow = 1)) + scale_y_continuous() + theme_bw() + - geom_hline(yintercept = 0) + - ylab("Mean of WIS(AR) - WIS(F)") + + geom_hline(yintercept = 0, linetype = "dashed") + + ylab("Mean of AR WIS - forecaster WIS") + xlab("") + theme(legend.position = "bottom", legend.title = element_blank()) ``` - - ```{r upswing-histogram, include=FALSE, fig.width=9, fig.height=4.5} # Fgure 4 up_down_df %>% @@ -447,28 +419,9 @@ up_down_df %>% geom_vline(xintercept = 0) ``` -```{r upswing-histogram-logged, eval = FALSE, fig.cap="Not sure if we want this here. Similar to Figure 5 in the manuscript but taking logs. "} -up_down_df %>% - group_by(forecaster, udf) %>% - ggplot(aes((AR_wis + 1) / (wis + 1), fill = forecaster)) + - geom_histogram(bins = 100) + - facet_grid(udf ~ forecaster) + - theme_bw() + - ylab("Count") + - theme(legend.position = "none") + - scale_fill_manual(values = fcast_colors) + - scale_x_log10() + - scale_y_log10(breaks = c(10,1000,100000), - labels = trans_format("log10", math_format(10^.x))) + - xlab("AR WIS - forecaster WIS") + - geom_vline(xintercept = 1) -``` - \clearpage - - -```{r hotspots-upswing-downswing, fig.cap="Classification and loglikelihood separated into periods of upswing, downswing, and flat cases. Like the analysis of the forecasting task in the main paper (see Figure 7), relative performance is better during down and flat periods."} +```{r hotspots-upswing-downswing, fig.cap="Percentage change in classification error and log likelihood, relative that of the AR model, separated into up, down, and flat periods. Like the analogous forecasting analysis, the indicator models generally do better during down and flat periods."} hot_udf <- inner_join( hotspots_honest, up_down %>% select(geo_value, target_end_date, udf)) @@ -512,108 +465,16 @@ bind_rows(con_tab %>% mutate(err = "Classification error"), ggplot(aes(udf, (m - mar) / abs(mar) , fill = forecaster)) + geom_bar(width = 0.6, position = position_dodge(width=0.6), stat = "identity") + - scale_fill_manual(values = fcast_colors[1:5], guide = guide_legend(nrow = 1)) + + scale_fill_manual(values = fcast_colors2, guide = guide_legend(nrow = 1)) + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + theme_bw() + - geom_hline(yintercept = 0) + + geom_hline(yintercept = 0, linetype = "dashed") + facet_wrap(~err) + - ylab("Change relative to AR") + - xlab("") + - theme(legend.position = "bottom", legend.title = element_blank()) -``` - -```{r hotspots-upswing-downswing-remake, fig.cap="REMAKE: Classification and loglikelihood separated into periods of upswing, downswing, and flat cases. The indicator assisted models always have lower classification error relative to the null classifier, while the AR actually does worse in down and flat periods. In terms of loglikelihood, all forecasters have lower likelihood than the null classifier during up periods. In down and flat periods, the indicators generally improve over the AR, while the are worse during up periods.", eval=FALSE} -# mycummean <- function(x) { -# isgood <- ! 
is.na(x) -# denom <- cumsum(isgood) -# x[!isgood] <- 0 -# num <- cumsum(x) -# y <- num / denom -# y[is.na(y)] <- .5 # deal with empties at the beginning -# y -# } -# -# null_classifier <- readRDS(here("data", -# "all_signals_wide_as_of_2020-05-18.RDS")) %>% -# rename(jhu = `value+0:jhu-csse_confirmed_7dav_incidence_prop`) %>% -# select(geo_value, time_value, jhu) %>% -# group_by(geo_value) %>% -# arrange(time_value) %>% -# mutate(lag_value = lag(jhu, 7), -# apc = (jhu - lag_value) / lag_value, -# hot = case_when( -# apc > .25 & lag_value > 30 ~ 1, -# apc <= .25 & lag_value > 30 ~ 0), -# null_class = mycummean(hot)) - -hot_udf <- inner_join( - hotspots_honest, - up_down %>% select(geo_value, target_end_date, udf)) - -cutoff <- mean(hot_udf %>% filter(forecaster == "AR") %>% pull(actual)) - -con_tab <- hot_udf %>% - filter(!is.na(actual)) %>% - mutate(pred = value > .5) %>% - group_by(forecaster, udf) %>% - summarise(m = (mean(pred != actual) - cutoff) / cutoff, - err = mean(pred != actual)) %>% - ungroup() - -# con_tab <- left_join( -# con_tab %>% filter(forecaster != "AR"), -# con_tab %>% filter(forecaster == "AR") %>% -# select(-forecaster) %>% -# rename(mar = m)) - -llike_tab <- hot_udf %>% - filter(!is.na(actual)) %>% - # left_join(null_classifier %>% select(geo_value, time_value, null_class) %>% - # rename(target_end_date = time_value)) %>% - mutate( # kill prob 0-1 predictions - value = case_when( - value < 1e-8 ~ 1e-8, - value > 1-1e-8 ~ 1-1e-8, - TRUE ~ value), - # null_class = case_when( - # null_class < 1e-8 ~ 1e-8, - # null_class > 1-1e-8 ~ 1-1e-8, - # TRUE ~ null_class), - llike = (actual == 1) * log(value) + (actual == 0) * log(1 - value), - nulldev = (actual == 1) * log(cutoff) + - (actual == 0) * log(1 - cutoff)) %>% - group_by(forecaster, udf) %>% - summarise(m = (mean(llike) - mean(nulldev)) / abs(mean(nulldev)), - mm = mean(llike), - mmm = mean(nulldev)) %>% - ungroup() - -# llike_tab <- left_join( -# llike_tab %>% filter(forecaster != "AR"), -# llike_tab %>% filter(forecaster == "AR") %>% -# select(-forecaster) %>% -# rename(mar = m)) - -bind_rows(con_tab %>% mutate(err = "Classification error"), - llike_tab %>% mutate(err = "Log likelihood")) %>% - ggplot(aes(udf, m , fill = forecaster)) + - geom_bar(width = 0.6, position = position_dodge(width=0.6), - stat = "identity") + - scale_fill_manual(values = fcast_colors, guide = guide_legend(nrow = 1)) + - #scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + - theme_bw() + - geom_hline(yintercept = 0) + - scale_y_continuous(labels = scales::percent_format()) + - facet_wrap(~err, scales = "free_y") + - ylab("Improvement relative to the null classifier") + + ylab("% change relative to AR") + xlab("") + theme(legend.position = "bottom", legend.title = element_blank()) ``` -\clearpage - - - ```{r cor-with-actuals} @@ -645,7 +506,7 @@ pct_chng_df <- comparison_df %>% abs(wis_ratio - 1), pct_change,method = "spearman"), .groups = "drop") ``` -```{r cor-wis-ratio, fig.cap="Histograms of the Spearman correlation between the ratio of AR to AR WIS with the percent change in smoothed case rates relative to 7 days earlier."} +```{r cor-wis-ratio, fig.cap="Histograms of the Spearman correlation between the ratio of forecaster WIS to AR WIS with the \\% change in case rates, relative to case rates 7 days earlier."} ggplot(pct_chng_df %>% group_by(forecaster) %>% mutate(median = median(cor_wisratio_pctchange)), @@ -657,64 +518,28 @@ ggplot(pct_chng_df %>% geom_vline(aes(xintercept = median), linetype = "dotted") + 
geom_vline(xintercept = 0) + facet_wrap(~forecaster) + - xlab("Spearman correlation") + ylab("Relative frequency") + + xlab("Spearman correlation") + ylab("Frequency") + scale_y_continuous(labels = scales::label_percent(accuracy = .1)) ``` \clearpage -```{r cor-wis-ratio-m1, eval = FALSE, fig.cap="This is Alden's second set of histograms. Here we have the correlation of the absolute value of WIS ratio - 1 with the percent change in 7dav cases relative to 7 days earlier"} -ggplot(pct_chng_df %>% - group_by(forecaster) %>% - mutate(median = median(cor_abs_wisratio_minus_1_pctchange)), - aes(x = cor_abs_wisratio_minus_1_pctchange)) + - geom_histogram(aes(y = ..count.. / sum(..count..), fill = forecaster), bins = 50) + - scale_fill_manual(values = fcast_colors) + - theme_bw() + - theme(legend.position = "none") + - geom_vline(aes(xintercept = median), linetype = "dotted") + - geom_vline(xintercept = 0) + - facet_wrap(~forecaster) + - xlab("Spearman correlation") + ylab("Relative frequency") + - scale_y_continuous(labels = scales::label_percent(accuracy = 1)) -``` - -\clearpage - - -```{r upswing-corr-table, fig.cap="Correlation of the difference in WIS with the difference in median predictions for the AR model relative to the indicator-assisted forecaster. In down periods, improvements in forecast risk are highlycorrelated with lower median predictions. The opposite is true in up periods. This suggests, as one might expect that improved performance of the indicator-assisted model is attributable to being closer to the truth then the AR model. This conclusion is stronger in down periods then in up periods."} +```{r upswing-corr-table, fig.cap="Correlation of the difference in WIS with the difference in median predictions (each difference being between the AR model and an indicator model), separated into up, down, and flat periods. In down periods, improvements in forecast error are highly correlated with lower median predictions. The opposite is true in up periods, but the conclusion here appears to be weaker."} up_down_df %>% group_by(forecaster, udf) %>% summarise(cor = cor(AR_wis - wis, AR_med - med)) %>% ggplot(aes(udf, cor , fill = forecaster)) + geom_bar(width = 0.6, position = position_dodge(width=0.6), stat = "identity") + - scale_fill_manual(values = fcast_colors[1:5], guide = guide_legend(nrow = 1)) + - theme_bw() + - geom_hline(yintercept = 0) + + scale_fill_manual(values = fcast_colors, guide = guide_legend(nrow = 1)) + + theme_bw() + ylab("Correlation") + xlab("") + + geom_hline(yintercept = 0, linetype = "dashed") + theme(legend.position = "bottom", legend.title = element_blank()) - -# up_down_df %>% -# filter(period != "jm") %>% -# group_by(forecaster, udf) %>% -# summarise(cor = cor(AR_wis - wis, AR_med - med)) %>% -# pivot_wider(names_from = forecaster, values_from = cor) %>% -# kableExtra::kbl( -# booktabs = TRUE, digits = 2, centering = TRUE, -# caption = paste("Correlation of the difference in WIS between the AR", -# "model with the difference in median predictions. In down", - # "periods, improvements in forecast risk are highly", -# "correlated with lower median predictions. The opposite", -# "is true in up periods. This suggests, as one might expect", -# "that improved performance of the indicator-assisted", -# "model is attributable to being closer to the truth then", -# "the AR model. 
This conclusion is stronger in down", -# "periods then in up periods.")) ``` \clearpage -```{r upswing-summary-remake, fig.cap="Percent change in average WIS of the forecaster (AR or indicator assisted) relative to the baseline. All models perform poorly durning down periods, but the indicators help. During flat periods, the indicators improve slightly over the AR. During up periods, all forecasters do much better than the baseline, but only some do as well as AR."} +```{r upswing-summary-remake, fig.cap="Percentage change in average WIS of the forecaster (AR or indicator assisted), relative to the baseline. All models perform poorly during down periods, but the indicators help. During flat periods, the indicators improve slightly over the AR. During up periods, all forecasters do much better than the baseline, but only some do as well as AR."} up_down_df_summary <- inner_join( left_join( corr_df, @@ -729,14 +554,12 @@ up_down_df_summary %>% scale_fill_manual(values = fcast_colors, guide = guide_legend(nrow = 1)) + theme_bw() + scale_y_continuous(labels = scales::percent_format()) + - geom_hline(yintercept = 0) + - ylab("% change in Forecaster WIS\n relative to Baseline") + + geom_hline(yintercept = 0, linetype = "dashed") + + ylab("% change in WIS relative to baseline") + xlab("") + theme(legend.position = "bottom", legend.title = element_blank()) ``` - - \clearpage @@ -810,7 +633,6 @@ df2 <- left_join( by = c("geo_value","forecaster","target_end_date" = "time_value")) ``` - ```{r leading-only, include=FALSE, fig.width=6, fig.height=4} # Figure 5 in the manuscript df2 %>% @@ -821,8 +643,8 @@ df2 %>% ggplot(aes(udf, leadingness, fill = forecaster)) + geom_bar(width = 0.6, position = position_dodge(width=0.6), stat = "identity") + - geom_hline(yintercept = 0) + - scale_fill_manual(values = fcast_colors[1:5]) + + geom_hline(yintercept = 0, linetype = "dashed") + + scale_fill_manual(values = fcast_colors2) + guides(fill = guide_legend(nrow = 1)) + theme_bw() + ylab("Correlation") + @@ -835,38 +657,14 @@ df2 %>% legend.background = element_rect(fill = "transparent")) ``` - - -```{r leading-and-lagging, eval=FALSE} -# Deprecated -df2 %>% - group_by(forecaster, udf) %>% - summarise( - leadingness = cor(AR_wis - wis, leading, use = "pairwise.complete.obs"), - laggingness = cor(AR_wis - wis, lagging, use = "pairwise.complete.obs")) %>% - pivot_longer(leadingness:laggingness) %>% - ggplot(aes(udf, value, fill = forecaster)) + - geom_bar(width = 0.6, position = position_dodge(width=0.6), - stat = "identity") + - geom_hline(yintercept = 0) + - scale_fill_manual(values = fcast_colors) + - facet_wrap(~name) + - theme_bw() + - ylab("Correlation") + - xlab("") + - theme(legend.position = "bottom", legend.title = element_blank()) -``` - -```{r ccf-dv-finalized, eval=TRUE, fig.cap="Illustration of the cross-correlation function between DV-CLI and cases. The left panel shows the standardized signals over the period from August 1 to September 28 (as of May 15, 2021). The right panel shows $\\CCF_{\\ell}(a)$ for different values of $a$ as vertical blue bars. The orange dashed lines indicate the 95\\% significance threshold. By our leadingness/laggingness metric, DV-CLI is leading (but notlagging) cases over this period."} +```{r ccf-dv-finalized, fig.cap="Illustration of the cross-correlation function between DV-CLI and cases. The left panel shows the standardized signals over the period from August 1 to September 28 (as of May 15, 2021). 
The right panel shows $\\CCF_{\\ell}(a)$ for different values of $a$ as vertical blue bars. The orange dashed line indicates the 95\\% significance threshold. By our leadingness/laggingness metric, DV-CLI is leading (but not lagging) cases over this period."} source(here("code", "figures", "ccf-dv-finalized.R")) gg ``` \clearpage - -```{r lagging-only, fig.cap="Correlation of the difference in WIS with the laggingness of the indicator at the target date, stratified by up, down, or flat period. Compare to Figure 5 in the manuscript."} -# Deprecated +```{r lagging-only, fig.cap="Correlation of the difference in WIS with the laggingness of the indicator at the target date, stratified by up, down, or flat period. Compare to Figure 5 in the manuscript."} df2 %>% group_by(forecaster, udf) %>% summarise( @@ -875,8 +673,8 @@ df2 %>% ggplot(aes(udf, laggingness, fill = forecaster)) + geom_bar(width = 0.6, position = position_dodge(width=0.6), stat = "identity") + - geom_hline(yintercept = 0) + - scale_fill_manual(values = fcast_colors[1:5], guide = guide_legend(nrow = 1)) + + geom_hline(yintercept = 0, linetype = "dashed") + + scale_fill_manual(values = fcast_colors2, guide = guide_legend(nrow = 1)) + theme_bw() + ylab("Correlation") + xlab("") + @@ -886,7 +684,6 @@ df2 %>% \clearpage ```{r diff-in-lead-lag, fig.cap = "Correlation of the difference between leadingness and laggingness with the difference in WIS. The relationship is essentially the same as described in the manuscript and shown in Figure 5."} -# Deprecated df2 %>% group_by(forecaster, udf) %>% summarise(cor = cor(AR_wis - wis, leading - lagging, @@ -894,62 +691,41 @@ df2 %>% ggplot(aes(udf, cor, fill = forecaster)) + geom_bar(width = 0.6, position = position_dodge(width=0.6), stat = "identity") + - geom_hline(yintercept = 0) + - scale_fill_manual(values = fcast_colors[1:5]) + + geom_hline(yintercept = 0, linetype = "dashed") + + scale_fill_manual(values = fcast_colors2) + theme_bw() + ylab("Correlation") + xlab("") + theme(legend.position = "bottom", legend.title = element_blank()) ``` - - \clearpage - - - - - ```{r revisions-dv-jhu, eval=FALSE} # Figure 1 in the manuscript source(here("code", "figures", "revisions.R")) ``` - - - - ```{r ny-trajectory, eval=FALSE} # Figure 2 in the manuscript source(here("code", "figures", "trajectory.R")) -gg ``` - - - - - - - - \clearpage - -```{r cumulative-mean, fig.cap="This figure shows the cumulative sum of WIS for each forecaster divided by the cumulative sum of WIS for the baseline model. The shaded background shows total U.S. cases as reported by JHU-CSSE for the 14-day ahead target. Hashes along the $x$-axis denote weeks." } +```{r cumulative-mean, fig.cap="Cumulative sum of WIS for each forecaster divided by the cumulative sum of WIS for the baseline model. The shaded background shows national case incidence for the 14-day ahead target. Hash marks along the x-axis denote weeks." 
} cumulatives <- fcasts_honest %>% group_by(forecaster, forecast_date) %>% summarise(mw = Mean(wis), smw = Mean(strawman_wis), gmw = GeoMean(wis / strawman_wis)) %>% arrange(forecast_date) %>% - mutate(`Cumulative Mean` = cummean(mw) / cummean(smw), - `Cumulative Sum` = cumsum(mw) / cumsum(smw), + mutate(`Cumulative mean` = cummean(mw) / cummean(smw), + `Cumulative sum` = cumsum(mw) / cumsum(smw), `Cumulative Geo Mean` = exp(cummean(log(gmw))), `GM Regret` = cumsum(gmw) - cumsum(smw/smw), `AM Regret` = cumsum(mw) - cumsum(smw), @@ -964,13 +740,13 @@ ticks <- seq(min(us_cases$time_value), max(us_cases$time_value), by = 7) - 14 ggplot(cumulatives %>% filter(forecast_date < "2021-01-01")) + - geom_line(aes(forecast_date, y = `Cumulative Sum`, color = forecaster)) + + geom_line(aes(forecast_date, y = `Cumulative sum`, color = forecaster)) + annotate(x = us_cases$time_value - 14, ymax = us_cases$value, ymin = -Inf, geom = "ribbon", alpha = .1) + annotate("segment", x=ticks, y=-Inf, xend = ticks, yend = .77, color = "grey70") + - geom_hline(yintercept = 1) + - xlab("forecast date") + + geom_hline(yintercept = 1, linetype = "dashed") + + xlab("Forecast date") + ylab("Cumulative sum") + coord_cartesian(#ylim = c(.76, 1), xlim = ymd(c("2020-06-01", "2021-01-01"))) + scale_x_date(date_breaks = "2 months", date_minor_breaks = "1 week", @@ -983,15 +759,15 @@ ggplot(cumulatives %>% filter(forecast_date < "2021-01-01")) + \clearpage -```{r cumulative-geo-mean, fig.cap="This figure shows the cumulative geometric mean of WIS for each forecaster divided by the WIS for the baseline model. The shaded background shows total U.S. cases as reported by JHU-CSSE for the 14-day ahead target. Hashes along the $x$-axis denote weeks." } +```{r cumulative-geo-mean, fig.cap="Cumulative geometric mean of WIS for each forecaster divided by WIS for the baseline model. The shaded background shows national case incidence for the 14-day ahead target. Hashes along the x-axis denote weeks." 
, eval=FALSE} ggplot(cumulatives %>% filter(forecast_date < "2021-01-01")) + geom_line(aes(forecast_date, y = `Cumulative Geo Mean`, color = forecaster)) + annotate(x = us_cases$time_value - 14, ymax = us_cases$value, ymin = -Inf, geom = "ribbon", alpha = .1) + annotate("segment", x=ticks, y=-Inf, xend = ticks, yend = .77, color = "grey70") + - geom_hline(yintercept = 1) + - xlab("forecast date") + ylab("Cumulative Geometric Mean") + + geom_hline(yintercept = 1, linetype = "dashed") + + xlab("Forecast date") + ylab("Cumulative geo mean") + coord_cartesian(#ylim = c(.76, 1), xlim = ymd(c("2020-06-01", "2021-01-01"))) + scale_x_date(date_breaks = "2 months", date_minor_breaks = "1 week", @@ -1004,7 +780,7 @@ ggplot(cumulatives %>% filter(forecast_date < "2021-01-01")) + \clearpage -```{r errs-in-space, fig.cap="Percent improvement in WIS relative to the AR forecaster by region (negative numbers indicate improved performance, positives indicate worsening)."} +```{r errs-in-space, fig.cap="Percentage improvement in WIS, relative to the AR forecaster, by HRR (negative numbers indicate improved performance, positives indicate worsening)."} # fig.height = 9.6, fig.width = 6, out.width = "5in", out.height="8in", by_hrr <- fcasts_honest %>% group_by(forecaster, geo_value) %>% @@ -1041,137 +817,4 @@ maps <- purrr::map( map_plot <- cowplot::plot_grid(plotlist = maps, ncol = 3) map_leg <- get_legend(maps[[1]] + theme(legend.position = "bottom")) cowplot::plot_grid(map_plot, map_leg, ncol = 1, rel_heights = c(1, .1)) -``` - -\clearpage - - - -```{r grab-submitted-case-fcasts} -library(aws.s3) -Sys.setenv("AWS_DEFAULT_REGION" = "us-east-2") -s3bucket <- get_bucket("forecast-eval") -# ensure that forecasters have submitted a reasonable amount -n_keeper_weeks <- 6L -n_keeper_locs <- 50L -case_scores <- s3readRDS("score_cards_state_cases.rds", s3bucket) %>% - mutate(forecast_date = target_end_date - ahead * 7) %>% - # fix weirdnesses about submission dates, - # this is generous to the forecaster - filter(forecast_date < "2021-01-01", ahead < 4) %>% - select(ahead, geo_value, forecaster, target_end_date, wis, forecast_date) - -strawman <- case_scores %>% filter(forecaster == "COVIDhub-baseline") -case_scores <- left_join( - case_scores, - strawman %>% - select(forecast_date, target_end_date, geo_value, wis) %>% - rename(strawman_wis = wis) -) %>% filter(forecaster != "COVIDhub-baseline") - -# Subset to those forecasters that submitted enough 14-day ahead forecasts -n_submitted <- case_scores %>% - filter(ahead == 2) %>% - group_by(forecaster) %>% - summarise(nfcasts = n()) -keepers <- n_submitted %>% - filter(nfcasts / n_keeper_locs > n_keeper_weeks - .0001, - forecaster != "COVIDhub-4_week_ensemble") %>% # same as CH-Ensemble - pull(forecaster) - -case_scores <- case_scores %>% - filter(forecaster %in% keepers) %>% - group_by(forecaster, ahead) %>% - summarise(rel_wis = Mean(wis) / Mean(strawman_wis), - geo_wis1 = GeoMean((wis + 1) / (strawman_wis + 1)), - geo_wis = GeoMean(wis / strawman_wis)) -``` - -```{r compare-to-hub-mean, fig.height = 8.5, fig.cap="This figure reproduces Figure 3 in the main paper but overlays scores for the forecasts submitted to the COVID-19 Forecast Hub. Grey lines correspond to the various teams that submitted during period our evaluation period. 
We have highlighted the COVIDhub-ensemble, which is the official forecast of the CDC.", include=FALSE} -# Same as Fig 3 in the paper -df <- summarizer(fcasts_honest %>% - group_by(ahead, forecaster), "wis", NULL, - "strawman_wis", Mean, c("aggr","scale")) -ggplot(df) + - geom_line(aes(ahead, wis, color = forecaster)) + - geom_point(aes(ahead, wis, color = forecaster)) + - theme_bw() + - geom_hline(yintercept = 1, size = 1.5) + - xlab("Days ahead") + - ylab("Mean WIS (relative to baseline)") + - scale_color_manual(values = c(fcast_colors, "CH-ensemble" = "lightblue"), - guide = guide_legend(nrow = 2)) + - theme(legend.position = "bottom", legend.title = element_blank()) + - geom_line(data = case_scores, - aes(ahead * 7, rel_wis, group = forecaster), - color = "grey70") + - geom_line(data = case_scores %>% filter(forecaster == "COVIDhub-ensemble"), - aes(ahead * 7, rel_wis), color = "lightblue", size = 1.5) + - scale_y_log10() -``` - -\clearpage - -```{r compare-to-hub-geomean, fig.height = 8.5, fig.cap="This figure is similar to Figure \\ref{fig:fcast-adjusted}. In this case, we add 1 to both the forecaster WIS and the baseline WIS before scaling (to allow forecasters that achieve 0 error to appear), and we overlay scores for the forecasts submitted to the COVID-19 Forecast Hub. Grey lines correspond to the various teams that submitted during period our evaluation period. We have highlighted the COVIDhub-ensemble, which is the official forecast of the CDC.", include=FALSE} -# Like supplement GeoMean figure, but adding 1 -df2 <- fcasts_honest %>% - group_by(ahead, forecaster) %>% - summarise(wis = GeoMean((wis + 1) / (strawman_wis + 1))) - -ggplot(df2) + - geom_line(aes(ahead, wis, color = forecaster)) + - geom_point(aes(ahead, wis, color = forecaster)) + - theme_bw() + - geom_hline(yintercept = 1, size = 1.5) + - xlab("Days ahead") + - ylab("Geometric mean of WIS (relative to baseline)") + - scale_color_manual(values = c(fcast_colors, "CH-ensemble" = "lightblue"), - guide = guide_legend(nrow = 2)) + - theme(legend.position = "bottom", legend.title = element_blank()) + - geom_line(data = case_scores, - aes(ahead * 7, geo_wis1, group = forecaster), - color = "grey70") + - geom_line(data = case_scores %>% filter(forecaster == "COVIDhub-ensemble"), - aes(ahead * 7, geo_wis1), color = "lightblue", size = 1.5) + - scale_y_log10() -``` - - - - - -```{r fcast-gs-locations, eval=FALSE} -plotter(fcasts_gs %>% filter(period != "jm"), - "wis", Mean, scaler = "strawman_wis", - order_of_operations = c("aggr","scale")) + - ylab("Mean WIS (relative to baseline)") -``` - -```{r fcast-gs-locations-adjusted, eval=FALSE} -plotter(fcasts_gs %>% filter(period != "jm"), - "wis", GeoMean, scaler = "strawman_wis", - order_of_operations = c("scale","aggr")) + - ylab("Geometric mean of WIS (relative to baseline)") -``` - -```{r hot-gs-locations, eval=FALSE} -plotter_hotspots(hotspots_gs %>% filter(period != "jm")) + - geom_hline(yintercept = 0.5) -``` - - - -```{r traj-data, eval = FALSE} -source(here("code", "deprecated", "trajectory_plot_funs.R")) -preds <- readRDS(here("data", "predictions_honest.RDS")) -traj_best <- get_trajectory_plots(fcasts_honest, preds, actuals, hrr_tab, - "only2020", "best") -traj_worst <- get_trajectory_plots(fcasts_honest, preds, actuals, hrr_tab, - "only2020", "worst") -rm(preds) -``` - -```{r trajectory-plots, eval = FALSE} -for (nm in names(traj_best)) print(trajectory_panel(nm, traj_best, traj_worst)) -``` - +``` \ No newline at end of file diff --git a/forecast/paper/supplement.pdf 
b/forecast/paper/supplement.pdf new file mode 100644 index 0000000..18dafe1 Binary files /dev/null and b/forecast/paper/supplement.pdf differ diff --git a/forecast/paper/supplement.tex b/forecast/paper/supplement.tex index 2327234..caced45 100644 --- a/forecast/paper/supplement.tex +++ b/forecast/paper/supplement.tex @@ -56,7 +56,7 @@ } -\caption{Relative forecast WIS with vintage compared to finalized data. Using finalized data leads to overly optimistic performance.}\label{fig:fcast-honest-v-finalized} +\caption{Forecast performance with vintage compared to finalized data. Using finalized data leads to overly optimistic performance.}\label{fig:fcast-honest-v-finalized} \end{figure} \clearpage @@ -67,7 +67,7 @@ } -\caption{Relative AUC with vintage compared to finalized data. Using finalized data leads to overly optimistic hotspot performance.}\label{fig:hot-honest-v-finalized} +\caption{Hotspot prediction performance with vintage compared to finalized data. Using finalized data leads to overly optimistic performance.}\label{fig:hot-honest-v-finalized} \end{figure} \clearpage @@ -78,7 +78,7 @@ } -\caption{Weighted interval score appears to more closely resemble a log-Gaussian distribution.}\label{fig:wis-densities} +\caption{WIS values from forecast models, which appear to be roughly log-Gaussian.}\label{fig:wis-densities} \end{figure} \clearpage @@ -89,7 +89,18 @@ } -\caption{Relative forecast performance using vintage data and summarizing with the more robust geometric mean.}\label{fig:fcast-adjusted} +\caption{Forecast performance (using vintage data), summarized by geometric mean.}\label{fig:fcast-adjusted} +\end{figure} + +\clearpage + +\begin{figure} + +{\centering \includegraphics[width=\textwidth]{fig/compare-states-to-hub} + +} + +\caption{Forecast performance for AR and indicator models, each retrained at the state level, compared to models submitted to the COVID-19 Forecast Hub over the same period. The thin grey lines are individual models from the Hub; the light blue line is the Hub ensemble model. To align prediction dates as best as possible, we look at the AR and indicator model forecasts for 5, 12, 19, and 26 days ahead; this roughly corresponds to 1, 2, 3, and 4 weeks ahead, respectively, since in the Hub, models typically submit forecasts on a Tuesday for the epiweeks aligned to end on each of the following 4 Saturdays.}\label{fig:compare-to-hub} \end{figure} \clearpage @@ -100,19 +111,19 @@ } -\caption{P-values for a sign test that the WIS of the AR forecaster is smaller than that of the indicator-assisted model. Each P-value corresponds a particular forecast date.}\label{fig:sign-test} +\caption{P-values from a one-sided sign test for improved forecast performance of the indicator-assisted models. Each p-value corresponds to a forecast date. The alternative hypothesis is that the AR model is better (median difference between the relative WIS of the AR model and an indicator model is negative).}\label{fig:sign-test} \end{figure} \begin{table} -\caption{\label{tab:dm-test}$P$-values for a one-sided Diebold-Mariano test for improvement in forecast error. The test is for the null hypothesis of equal performance against the alternative that the indicator assisted model is better.} +\caption{\label{tab:dm-test}P-values from a one-sided Diebold-Mariano test for improvemed forecast performance when adding the indicators. 
@@ -100,19 +111,19 @@
 
 }
 
-\caption{P-values for a sign test that the WIS of the AR forecaster is smaller than that of the indicator-assisted model. Each P-value corresponds a particular forecast date.}\label{fig:sign-test}
+\caption{P-values from a one-sided sign test for improved forecast performance of the indicator-assisted models. Each P-value corresponds to a forecast date. The alternative hypothesis is that the AR model is better (median difference between the relative WIS of the AR model and an indicator model is negative).}\label{fig:sign-test}
 \end{figure}
 
 \begin{table}
 
-\caption{\label{tab:dm-test}$P$-values for a one-sided Diebold-Mariano test for improvement in forecast error. The test is for the null hypothesis of equal performance against the alternative that the indicator assisted model is better.}
+\caption{\label{tab:dm-test}P-values from a one-sided Diebold-Mariano test for improved forecast performance when adding the indicators. The alternative hypothesis is that the AR model is better.}
 \centering
 \begin{tabular}[t]{lrrrrr}
 \toprule
 metric & CHNG-CLI & CHNG-COVID & CTIS-CLIIC & DV-CLI & Google-AA\\
 \midrule
-Geometric Mean Relative WIS & 0.116 & 0.067 & 0.000 & 0.059 & 0.008\\
-Mean Relative WIS & 0.159 & 0.111 & 0.002 & 0.074 & 0.064\\
+Geometric mean & 0.072 & 0.036 & 0.005 & 0.032 & 0.026\\
+Mean & 0.177 & 0.132 & 0.006 & 0.092 & 0.103\\
 \bottomrule
 \end{tabular}
 \end{table}
@@ -156,7 +167,7 @@
 
 }
 
-\caption{Average difference between the WIS of the AR model and the WIS of the other forecasters. The indicator-assisted forecasters do best during down and flat periods.}\label{fig:upswing-summary}
+\caption{Average difference between the WIS of the AR model and that of the indicator models, separated into up, down, and flat periods. The indicator models generally do best during down and flat periods.}\label{fig:upswing-summary}
 \end{figure}
 
 \clearpage
@@ -167,31 +178,27 @@
 
 }
 
-\caption{Classification and loglikelihood separated into periods of upswing, downswing, and flat cases. Like the analysis of the forecasting task in the main paper (see Figure 7), relative performance is better during down and flat periods.}\label{fig:hotspots-upswing-downswing}
+\caption{Percentage change in classification error and log likelihood, relative to that of the AR model, separated into up, down, and flat periods. Like the analogous forecasting analysis, the indicator models generally do better during down and flat periods.}\label{fig:hotspots-upswing-downswing}
 \end{figure}
 
-\clearpage
-
 \begin{figure}
 
 {\centering \includegraphics[width=\textwidth]{fig/cor-wis-ratio-1}
 
 }
 
-\caption{Histograms of the Spearman correlation between the ratio of AR to AR WIS with the percent change in smoothed case rates relative to 7 days earlier.}\label{fig:cor-wis-ratio}
+\caption{Histograms of the Spearman correlation between the ratio of forecaster WIS to AR WIS and the \% change in case rates, relative to case rates 7 days earlier.}\label{fig:cor-wis-ratio}
 \end{figure}
 
 \clearpage
 
-\clearpage
-
 \begin{figure}
 
 {\centering \includegraphics[width=\textwidth]{fig/upswing-corr-table-1}
 
 }
 
-\caption{Correlation of the difference in WIS with the difference in median predictions for the AR model relative to the indicator-assisted forecaster. In down periods, improvements in forecast risk are highlycorrelated with lower median predictions. The opposite is true in up periods. This suggests, as one might expect that improved performance of the indicator-assisted model is attributable to being closer to the truth then the AR model. This conclusion is stronger in down periods then in up periods.}\label{fig:upswing-corr-table}
+\caption{Correlation of the difference in WIS with the difference in median predictions (each difference being between the AR model and an indicator model), separated into up, down, and flat periods. In down periods, improvements in forecast error are highly correlated with lower median predictions. The opposite is true in up periods, but the conclusion here appears to be weaker.}\label{fig:upswing-corr-table}
 \end{figure}
 
 \clearpage
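The `cor-wis-ratio` histograms in the hunk above summarize Spearman correlations between a forecaster-to-AR WIS ratio and recent case-rate changes. The sketch below shows the basic computation with base R's `cor()`; the column names, the per-location grouping, and the simulated data are illustrative assumptions, not the repository's actual schema or stratification.

```r
library(dplyr)

set.seed(1)

# Illustrative scores: 10 locations x 40 target dates at a fixed horizon.
# Columns: indicator-model WIS, AR-model WIS, and case rates at the target
# date and 7 days earlier (all names are stand-ins).
scores <- expand.grid(geo_value   = sprintf("hrr%02d", 1:10),
                      target_date = as.Date("2020-10-01") + 7 * (0:39)) %>%
  mutate(
    wis            = rexp(n(), rate = 1 / 10),
    wis_ar         = rexp(n(), rate = 1 / 10),
    case_rate      = rexp(n(), rate = 1 / 30),
    case_rate_lag7 = rexp(n(), rate = 1 / 30)
  )

# One Spearman correlation per location, between the WIS ratio and the
# % change in case rates relative to 7 days earlier; a histogram of values
# like these is the kind of display shown in the figure.
cors <- scores %>%
  mutate(
    wis_ratio  = wis / wis_ar,
    pct_change = 100 * (case_rate - case_rate_lag7) / case_rate_lag7
  ) %>%
  group_by(geo_value) %>%
  summarise(rho = cor(wis_ratio, pct_change, method = "spearman"),
            .groups = "drop")

hist(cors$rho, main = "", xlab = "Spearman correlation")
```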
@@ -202,7 +209,7 @@
 
 }
 
-\caption{Percent change in average WIS of the forecaster (AR or indicator assisted) relative to the baseline. All models perform poorly durning down periods, but the indicators help. During flat periods, the indicators improve slightly over the AR. During up periods, all forecasters do much better than the baseline, but only some do as well as AR.}\label{fig:upswing-summary-remake}
+\caption{Percentage change in average WIS of the forecaster (AR or indicator-assisted), relative to the baseline. All models perform poorly during down periods, but the indicators help. During flat periods, the indicators improve slightly over the AR. During up periods, all forecasters do much better than the baseline, but only some do as well as AR.}\label{fig:upswing-summary-remake}
 \end{figure}
 
 \clearpage
@@ -213,7 +220,7 @@
 
 }
 
-\caption{Illustration of the cross-correlation function between DV-CLI and cases. The left panel shows the standardized signals over the period from August 1 to September 28 (as of May 15, 2021). The right panel shows $\CCF_{\ell}(a)$ for different values of $a$ as vertical blue bars. The orange dashed lines indicate the 95\% significance threshold. By our leadingness/laggingness metric, DV-CLI is leading (but notlagging) cases over this period.}\label{fig:ccf-dv-finalized}
+\caption{Illustration of the cross-correlation function between DV-CLI and cases. The left panel shows the standardized signals over the period from August 1 to September 28 (as of May 15, 2021). The right panel shows $\CCF_{\ell}(a)$ for different values of $a$ as vertical blue bars. The orange dashed line indicates the 95\% significance threshold. By our leadingness/laggingness metric, DV-CLI is leading (but not lagging) cases over this period.}\label{fig:ccf-dv-finalized}
 \end{figure}
 
 \clearpage
@@ -224,7 +231,7 @@
 
 }
 
-\caption{Correlation of the difference in WIS with the laggingness of the indicator at the target date, stratified by up, down, or flat period. Compare to Figure 5 in the manuscript.}\label{fig:lagging-only}
+\caption{Correlation of the difference in WIS with the laggingness of the indicator at the target date, stratified by up, down, or flat period. Compare to Figure 5 in the manuscript.}\label{fig:lagging-only}
 \end{figure}
 
 \clearpage
@@ -248,20 +255,11 @@
 
 }
 
-\caption{This figure shows the cumulative sum of WIS for each forecaster divided by the cumulative sum of WIS for the baseline model. The shaded background shows total U.S. cases as reported by JHU-CSSE for the 14-day ahead target. Hashes along the $x$-axis denote weeks.}\label{fig:cumulative-mean}
+\caption{Cumulative sum of WIS for each forecaster divided by the cumulative sum of WIS for the baseline model. The shaded background shows national case incidence for the 14-day ahead target. Hash marks along the x-axis denote weeks.}\label{fig:cumulative-mean}
 \end{figure}
 
 \clearpage
 
-\begin{figure}
-
-{\centering \includegraphics[width=\textwidth]{fig/cumulative-geo-mean-1}
-
-}
-
-\caption{This figure shows the cumulative geometric mean of WIS for each forecaster divided by the WIS for the baseline model. The shaded background shows total U.S. cases as reported by JHU-CSSE for the 14-day ahead target. Hashes along the $x$-axis denote weeks.}\label{fig:cumulative-geo-mean}
-\end{figure}
-
 \clearpage
 
 \begin{figure}
@@ -270,13 +268,9 @@
 
 }
 
-\caption{Percent improvement in WIS relative to the AR forecaster by region (negative numbers indicate improved performance, positives indicate worsening).}\label{fig:errs-in-space}
+\caption{Percentage improvement in WIS, relative to the AR forecaster, by HRR (negative numbers indicate improved performance, positive numbers indicate worsening).}\label{fig:errs-in-space}
 \end{figure}
 
-\clearpage
-
-\clearpage
-
 \FloatBarrier
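The leadingness/laggingness figures above are built from a cross-correlation function (the $\CCF_{\ell}(a)$ quantity in the captions) and a 95% significance threshold. A minimal sketch of those ingredients using base R's `ccf()` and the usual approximate threshold of ±1.96/sqrt(n) follows; the simulated series and the final sum over significant leading lags are meant only to convey the flavor of the metric, not to reproduce the paper's exact definition.

```r
set.seed(1)

# Simulated stand-ins for an indicator and case rates over roughly two
# months, constructed so that the indicator leads cases by about 5 days.
n         <- 60
latent    <- as.numeric(arima.sim(list(ar = 0.8), n = n + 5))
indicator <- latent[6:(n + 5)]   # today's indicator resembles cases 5 days out
cases     <- latent[1:n] + rnorm(n, sd = 0.3)

# Standardize both signals (as in the left panel) and compute the CCF.
z  <- function(x) (x - mean(x)) / sd(x)
cc <- ccf(z(indicator), z(cases), lag.max = 10, plot = FALSE)

# Approximate 95% significance threshold for a sample cross-correlation.
threshold <- qnorm(0.975) / sqrt(n)

# For ccf(x, y), correlations at negative lags measure x leading y. A rough
# "leadingness" score: the sum of significant cross-correlations at those lags.
lead_idx    <- cc$lag < 0 & abs(cc$acf) > threshold
leadingness <- sum(cc$acf[lead_idx])
leadingness
```

Swapping the roles of the two series (or summing over positive lags instead) gives the corresponding laggingness-style quantity.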