p-e-w · RyderFreeman4Logos · Mar 25, 2026 · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/.gitignore b/.gitignore
@@ -15,8 +15,12 @@ wheels/
 # Editors
 /.vscode/
 
-# Configuration files
+# Configuration files (may contain API keys)
 /config.toml
+/judge.toml
+
+# Environment variables
+.env
 
 # Study checkpoints
 /checkpoints/

diff --git a/judge.default.toml b/judge.default.toml
@@ -0,0 +1,25 @@
+# LLM judge configuration (hot-reloadable — changes take effect without restart).
+#
+# Copy to judge.toml and edit.  Environment variables override file values.
+#
+# Env var mapping:
+#   LLM_JUDGE_API_BASE, LLM_JUDGE_API_KEY, LLM_JUDGE_MODELS (comma-separated),
+#   LLM_JUDGE_BATCH_SIZE, LLM_JUDGE_CONCURRENCY, LLM_JUDGE_TIMEOUT,
+#   LLM_JUDGE_MAX_RETRIES, LLM_JUDGE_PRICING (model:in:out,...)
+#
+# Config file path can be changed via LLM_JUDGE_CONFIG env var (default: judge.toml).
+
+api_base = "http://localhost:8317/v1/chat/completions"
+# api_key = ""   # Prefer LLM_JUDGE_API_KEY env var.
+
+models = ["gpt-mini", "spark", "gemini-flash"]  # NOTE(review): presumably tried in listed order as fallbacks — confirm.
+
+batch_size  = 10    # Items per API call.
+concurrency = 6     # Parallel batch workers.
+timeout     = 90    # Seconds per HTTP request.
+max_retries = 3     # Retries per model before fallback.
+
+[pricing]   # USD per 1M tokens: [input, output]. One entry per name listed in models.
+gpt-mini      = [0.15, 0.60]
+spark         = [0.50, 2.00]
+gemini-flash  = [0.15, 0.60]
diff --git a/lefthook.yml b/lefthook.yml
@@ -0,0 +1,10 @@
+pre-commit:  # Each command below delegates to the mise task of the same name.
+  commands:
+    fmt:
+      run: mise run fmt
+    lint:
+      run: mise run lint
+    typecheck:
+      run: mise run typecheck
+    build:
+      run: mise run build
diff --git a/mise.toml b/mise.toml
@@ -0,0 +1,31 @@
+[tools]
+uv = "latest"  # Not pinned to a specific version.
+lefthook = "latest"  # Not pinned to a specific version.
+
+[tasks.fmt]  # Check-only (--check); use "fmt:fix" to apply formatting.
+description = "Check code formatting"
+run = "uv run ruff format --check ."
+
+[tasks."fmt:fix"]  # Key is quoted because ":" is not allowed in a bare TOML key.
+description = "Apply code formatting"
+run = "uv run ruff format ."
+
+[tasks.lint]  # "--extend-select I" adds the import-sorting rules to the lint run.
+description = "Lint and check import sorting"
+run = "uv run ruff check --extend-select I ."
+
+[tasks."lint:fix"]  # Same rule selection as "lint", plus --fix.
+description = "Lint and auto-fix"
+run = "uv run ruff check --extend-select I --fix ."
+
+[tasks.typecheck]  # --error-on-warning: warnings also fail the task.
+description = "Type check with ty"
+run = "uv run ty check --error-on-warning ."
+
+[tasks.build]
+description = "Build package"
+run = "uv build"
+
+[tasks.check]  # Aggregate task: no run command of its own, only dependencies.
+description = "Run all quality gates (CI equivalent)"
+depends = ["fmt", "lint", "typecheck", "build"]
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,6 +42,10 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
+llm-judge = [
+    "httpx>=0.27",
+    "tomli>=2; python_version < '3.11'",
+]
 research = [
     "geom-median~=0.1",
     "imageio~=2.37",
@@ -52,6 +56,7 @@ research = [
 
 [dependency-groups]
 dev = [
+    "pytest>=9.0.2",
     "ruff>=0.14.5",
     "ty>=0.0.5",
 ]

diff --git a/src/heretic/config.py b/src/heretic/config.py
@@ -341,6 +341,11 @@ class Settings(BaseSettings):
         description="Strings whose presence in a response (case insensitive) identifies the response as a refusal.",
     )
 
+    use_llm_judge: bool = Field(
+        default=False,
+        description="Use LLM judge for refusal classification instead of substring matching.",
+    )
+
     system_prompt: str = Field(
         default="You are a helpful assistant.",
         description="System prompt to use when prompting the model.",