forecastingresearch · houtanb · May 13, 2026 · elsehow · Jun 3, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,138 @@
+# Repository Instructions
+
+## Shared LLM Registry
+
+This package targets Python 3.14. Black is configured with
+`target-version = ["py314"]`; do not broaden `requires-python` without first
+checking that formatted code remains valid for the older target.
+
+## Local Development Setup
+
+Use Python 3.14 for local development:
+
+```bash
+python3.14 -m venv .venv
+source .venv/bin/activate
+python -m pip install --upgrade pip
+python -m pip install -r requirements.txt
+```
+
+`requirements.txt` delegates to `.[dev]`; it installs this package and the dev
+tools from `pyproject.toml` without editable mode.
+
+When another repo needs local utils changes during development, use that repo's
+virtual environment and install utils explicitly in editable mode, for example:
+
+```bash
+python -m pip install -e ../utils
+```
+
+Do not add local relative paths to another repo's requirements files. Those
+files should use the deployed git pin when ready to deploy.
+
+The shared LLM registry has two layers:
+
+- `utils.llm.model_registry.MODELS` contains canonical provider-callable base models.
+- `utils.llm.model_runs.MODEL_RUNS` contains exact benchmarkable model-plus-options runs.
+
+Benchmarks should choose from `MODEL_RUNS` by `model_run_key`; forecast files should store that exact key.
+
+When adding a base model:
+
+- Add provider/lab registry entries first only if the provider or lab is missing.
+- Look up the model in Models.dev. Prefer a `ModelsDevReference` when Models.dev
+  has the provider/model entry.
+- In Models.dev source paths, `provider_id` is the folder under `providers/`,
+  and `model_id` is the TOML filename stem under `models/`, for example
+  `providers/anthropic/models/claude-opus-4-8.toml` maps to `anthropic` /
+  `claude-opus-4-8`.
+- The checked-in Models.dev snapshot is not a catalog; it contains only
+  registry-referenced models and only `id`, `name`, and `release_date`.
+- Use exact Models.dev `provider_id`/`model_id` values. If a reference is wrong,
+  refreshing the snapshot should fail and suggest nearby Models.dev entries.
+- Use `manual_release_date` when the model is missing from Models.dev, when the
+  Models.dev entry lacks a usable full release date, or for deliberate
+  historical/manual entries.
+- Put the model in the provider-specific list in `utils/llm/model_registry.py` (`OPENAI_MODELS`, `TOGETHER_MODELS`, `ANTHROPIC_MODELS`, `XAI_MODELS`, or `GOOGLE_MODELS`).
+- Insert the model where `(release_date, model_key)` stays ascending within its
+  provider-specific list.
+- Use `provider_model_id` for the exact string sent to the provider API. It may differ from `model_key`, especially for routed providers like Together.
+- Set `active=False` only when a provider route should remain in registry history
+  but should be excluded from current live-callable benchmark runs.
+- Do not add duplicate `model_key`s. `MODELS = create_models_list(...)` validates uniqueness.
+
+After changing `ModelsDevReference` values, refresh the Models.dev snapshot from the utils repo:
+```bash
+python - <<'PY'
+from scripts.refresh_models_dev_metadata import write_models_dev_snapshot
+
+write_models_dev_snapshot()
+PY
+```
+
+When adding a model run:
+
+- Add it to `utils/llm/model_runs.py` with
+  `_model_run(model_run_key=..., model_key=..., options=...)`.
+- Write `model_run_key` explicitly as the stable benchmark identifier. Do not
+  rely on implicit generation from model/options.
+- Put every runtime call option in the `ModelRun` declaration; do not add hidden defaults elsewhere.
+- Use exact provider option names and values as they are passed to `get_response`.
+- If an option affects performance and should appear in filenames/forecast keys, add or update a naming rule in `NAME_COMPONENT_RULES`.
+- If an option is intentionally name-neutral, add it to `NAME_NEUTRAL_OPTION_PATHS`.
+- Unknown option paths should fail loudly rather than silently producing ambiguous model-run keys.
+- `build_model_run_key(...)` is a suggested-key helper for consistency checks and
+  new naming rules; the declared `model_run_key` remains the durable identity.
+- Do not add duplicate `model_run_key`s. `MODEL_RUNS = create_model_runs_list(...)` validates uniqueness.
+- `MODEL_RUNS` is the historical registry. `ACTIVE_MODEL_RUNS` is derived from
+  it by dropping runs whose base `Model` has `active=False`.
+- Add unit tests for new naming behavior, registry inclusion, and routed provider options when relevant.
+
+## Artificial Analysis Model Runs
+
+When adding an Artificial Analysis-backed model run:
+
+- Use the checked-in Artificial Analysis snapshot as the source for the stable AA model ID and displayed AA name.
+- Refresh the snapshot from the AA endpoint; do not hand-edit individual AA models into the JSON file.
+- The official AA API key is stored in GCP Secret Manager under the secret name `API_KEY_ARTIFICIAL_ANALYSIS`.
+- Do not hard-code an AA display name in a `ModelRun`; set `artificial_analysis_id` and let the run read the display name from the snapshot.
+- Do not add an `artificial_analysis_model` flag. A non-null `artificial_analysis_id` is the marker that a run is AA-backed.
+- Add or update the canonical base `Model` only if the provider-callable model is missing from `utils.llm.model_registry`.
+- Add the callable model-plus-options declaration to
+  `ARTIFICIAL_ANALYSIS_MODEL_RUN_DECLARATIONS` in
+  `utils/llm/artificial_analysis_model_runs.py`. Every declaration there is
+  automatically included in `utils.llm.model_runs.MODEL_RUNS`; do not add the
+  same AA run manually to `MODEL_RUNS`.
+- Use the exact provider option names that are passed at runtime. Token suffixes in model-run keys must reflect the actual token cap option used for the call.
+
+Artificial Analysis token caps should be encoded in the run options this way:
+
+- Non-reasoning models: use `16_384` output tokens, adjusted downward if the model has a smaller context window or a lower maximum output-token cap.
+- Reasoning models: use the maximum output tokens allowed by the model creator for that reasoning configuration.
+- If the correct cap is not clear from provider/model documentation or the AA metadata, stop and confirm rather than guessing.
+
+After adding an AA model run:
+
+- Add or update unit tests that prove the AA ID resolves from the snapshot and that `display_name` matches the AA leaderboard name.
+- Add or update shared registry coverage tests for the new selectable model-run key.
+- Run the focused model-run and AA metadata tests, then run the full lint/test suite before committing.
+
+## Validation
+
+- Run `make lint` before committing. It runs `isort .`, `black .`, `flake8 .`,
+  and `pydocstyle .`.
+- Run `make test` before committing code changes. Use `PYTEST_ARGS=...` for a
+  focused test pass while iterating.
+- Run `make test-integration` or `make test-integration-parallel` only when the
+  relevant provider/GCP credentials are available.
+
+## Live Model-Run Smoke Tests
+
+Integration tests that hit real LLM APIs require provider API keys.
+
+- `tests/conftest.py` loads `.env` and then calls `configure_api_keys(from_gcp=True)` when pytest is run with `--integration`.
+- `configure_api_keys(from_gcp=True)` reads provider keys from GCP Secret Manager using the secret names in `utils/helpers/constants.py`.
+- The standard LLM secret names are `API_KEY_OPENAI`, `API_KEY_ANTHROPIC`, `API_KEY_GEMINI`, `API_KEY_XAI`, and `API_KEY_TOGETHERAI`.
+- To test a specific shared model run, set `LLM_MODEL_RUN_KEYS` to one or more comma-separated `model_run_key`s and run `pytest --integration tests/integration/llm/test_model_runs.py`.
+- The model-run integration test calls `model_run.get_response`, so it uses the run's declared provider route, provider model ID, and options.
+- For a newly added model run, prefer running its exact smoke test before assuming the provider accepts the declared options.
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1 @@
+@AGENTS.md
diff --git a/Makefile b/Makefile
@@ -1,3 +1,5 @@
+PYTEST_ARGS ?=
+
 lint: pyproject.toml setup.cfg
 	isort .
 	black .
@@ -8,13 +10,13 @@ clean:
 	find . -type f -name "*~" -exec rm -f {} +
 
 test:
-	pytest
+	pytest $(PYTEST_ARGS)
 
 test-integration:
-	pytest --integration
+	pytest --integration $(PYTEST_ARGS)
 
 test-integration-parallel:
-	pytest --integration -n auto
+	pytest --integration -n auto $(PYTEST_ARGS)
 
 coverage:
-	pytest --cov=utils --cov-report=term-missing --cov-report=html
+	pytest --cov=utils --cov-report=term-missing --cov-report=html $(PYTEST_ARGS)
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ uv add fri-utils
 
 
 ```
-from utils.llm.model_registry import configure_api_keys, MODELS
+from utils.llm.model_registry import configure_api_keys, MODELS_BY_KEY
 
 # Input the API key for any model provider you like!
 configure_api_keys(
@@ -40,7 +40,7 @@ configure_api_keys(
 
 # Call any model we support!
 # See the full list of supported models in `utils/llm/model_registry.py`
-model = next(m for m in MODELS if m.id == "gemini-2.5-flash")
+model = MODELS_BY_KEY["gemini-2.5-pro"]
 model.get_response("Hello")
 # > "Hello! How can I help you?"
 ```
@@ -62,6 +62,12 @@ Use option names supported by the respective provider (`utils/llm/providers`).
 
 If you don’t see an option you need, feel free to open a GitHub issue!
 
+### Third-party metadata
+
+The shared LLM registry includes normalized metadata from Models.dev and
+Artificial Analysis. See `THIRD_PARTY_NOTICES.md` for Models.dev license terms
+and Artificial Analysis attribution.
+
 
 ### Configuring keys from GCP Secret Manager
 
@@ -71,7 +77,7 @@ If so, you can use the `from_gcp=True` shortcut to set your keys for all model p
 
 ```
 configure_api_keys(from_gcp=True) # Configure all provider keys from GCP.
-model = next(m for m in MODELS if m.id == "gpt-4.1-mini")
+model = MODELS_BY_KEY["gpt-5-mini-2025-08-07"]
 response = model.get_response("Hello")
 ```
 
@@ -82,6 +88,7 @@ If you're setting up a Google Cloud Project, the API keys must be stored in Secr
 - `API_KEY_OPENAI` for OpenAI
 - `API_KEY_XAI` for xAI
 - `API_KEY_TOGETHERAI` for Together AI
+- `API_KEY_ARTIFICIAL_ANALYSIS` for refreshing the Artificial Analysis metadata snapshot
 
 You can also check `utils/helpers/constants.py` for the complete list of secret names.
 

diff --git a/THIRD_PARTY_NOTICES.md b/THIRD_PARTY_NOTICES.md
@@ -0,0 +1,42 @@
+# Third-Party Notices
+
+This repository includes normalized metadata derived from third-party sources.
+
+## Models.dev
+
+The checked-in Models.dev snapshot is derived from <https://models.dev/api.json>
+and the upstream repository <https://github.com/anomalyco/models.dev>.
+
+Models.dev is licensed under the MIT License:
+
+```text
+MIT License
+
+Copyright (c) 2025 models.dev
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+```
+
+## Artificial Analysis
+
+The checked-in Artificial Analysis snapshot is derived from the Artificial
+Analysis free API and is minimized to the stable model IDs and display names
+used by this package.
+
+Attribution: Artificial Analysis, https://artificialanalysis.ai/.
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,31 +4,32 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "fri-utils"
-version = "0.1.0"
+version = "0.2.0"
 description = "Utilities for the Forecasting Research Institute codebase."
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = ">=3.14"
 license = { file = "LICENSE" }
 authors = [{ name = "Forecasting Research Institute" }]
 dependencies = [
-    "google-genai==1.73.1",
-    "anthropic==0.97.0",
-    "together==2.11.0",
-    "openai==2.33.0",
+    "google-genai==2.7.0",
+    "anthropic==0.105.2",
+    "together==2.16.0",
+    "openai==2.40.0",
     "google-cloud-secret-manager>=2.20.0",
     "google-cloud-storage>=2.14.0",
     "python-dotenv>=1.0.0",
 ]
 
 [project.optional-dependencies]
 dev = [
-    "black",
-    "flake8",
-    "flake8-bugbear",
-    "isort",
-    "pydocstyle",
-    "pytest",
-    "pytest-cov",
+    "black==26.5.1",
+    "flake8==7.3.0",
+    "flake8-bugbear==25.11.29",
+    "isort==8.0.1",
+    "pydocstyle==6.3.0",
+    "pytest==9.0.3",
+    "pytest-cov==7.1.0",
+    "pytest-xdist==3.8.0",
 ]
 
 [tool.setuptools.packages.find]
@@ -38,8 +39,15 @@ include = [
 ]
 exclude = ["tests*", "htmlcov*", "venv*"]
 
+[tool.setuptools]
+license-files = ["LICENSE", "THIRD_PARTY_NOTICES.md"]
+
+[tool.setuptools.package-data]
+"utils.llm.metadata" = ["*.json"]
+
 [tool.black]
 line-length = 100
+target-version = ["py314"]
 
 [tool.pytest.ini_options]
 markers = [

diff --git a/requirements.txt b/requirements.txt
@@ -1,15 +1 @@
-google-genai==1.73.1
-anthropic==0.97.0
-together==2.11.0
-openai==2.33.0
-google-cloud-secret-manager>=2.20.0
-google-cloud-storage>=2.14.0
-python-dotenv>=1.0.0
-isort
-black
-flake8
-flake8-bugbear
-pydocstyle
-pytest
-pytest-cov
-pytest-xdist
+.[dev]