latent-to · xavierlyu · May 17, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
@@ -0,0 +1,39 @@
+"""GET /api/eval-progress -- live eval round progress."""
+
+import time
+
+from fastapi import APIRouter
+from fastapi.responses import JSONResponse
+
+from api.config import STATE_DIR
+from api.helpers.state_reader import safe_json_load, sanitize_floats
+
+router = APIRouter()
+
+_STALE_THRESHOLD_S = 1800  # 30 minutes
+
+
+@router.get(
+    "/api/eval-progress",
+    tags=["Overview"],
+    summary="Live eval progress",
+    description=(
+        "Returns the current eval round progress, including phase, "
+        "per-challenger status, GPU info, and a timestamped step timeline. "
+        'Returns {"status": "idle"} when no eval is running.'
+    ),
+)
+def eval_progress():
+    data = safe_json_load(STATE_DIR / "eval_progress.json")
+    if data is None:
+        return JSONResponse(
+            content={"status": "idle"},
+            headers={"Cache-Control": "public, max-age=5"},
+        )
+    updated = data.get("updated_at", 0)
+    if data.get("status") == "running" and time.time() - updated > _STALE_THRESHOLD_S:
+        data["possibly_stale"] = True
+    return JSONResponse(
+        content=sanitize_floats(data),
+        headers={"Cache-Control": "public, max-age=5"},
+    )
@@ -12,6 +12,7 @@
 from api.routes.status import router as status_router
 from api.routes.king import router as king_router
 from api.routes.evaluations import router as evaluations_router
+from api.routes.eval_progress import router as eval_progress_router
 from api.routes.logs import router as logs_router
 from api.routes.rounds import router as rounds_router
 
@@ -66,6 +67,7 @@ def _client_ip(request: Request) -> str:
 app.include_router(status_router)
 app.include_router(king_router)
 app.include_router(evaluations_router)
+app.include_router(eval_progress_router)
 app.include_router(logs_router)
 app.include_router(rounds_router)
 

@@ -12,10 +12,12 @@
 from validator.chain import CommitmentRecord
 from validator.cpu_validator import (
     WEIGHTS_REFRESH_BLOCKS,
+    _clean_stale_eval_job,
     _needs_weight_set,
     run_tick,
 )
-from validator.eval_schema import EVAL_JOB_FILE, EvalJob
+from validator.eval_progress import purge_old_logs
+from validator.eval_schema import EVAL_JOB_FILE, ChallengerInfo, EvalJob
 from validator.state import EvaluationRecord, ValidatorState
 
 pytestmark = pytest.mark.unit
@@ -457,3 +459,134 @@ def test_continues_on_s3_download_failure(self, _mock_dl, tmp_path):
         )
 
         assert summary["block"] == 1000
+
+
+# --------------------------------------------------------------------------- #
+# _clean_stale_eval_job
+# --------------------------------------------------------------------------- #
+
+
+class TestCleanStaleEvalJob:
+    def test_removes_when_all_challengers_known(self, tmp_path):
+        state = ValidatorState()
+        rec = _make_eval_record(1, "hk1", 200, 0.5, eval_block=500)
+        state.record_evaluation(rec, current_block=500)
+
+        job = EvalJob(
+            block=1000,
+            block_hash="0xabc",
+            challengers=[
+                ChallengerInfo(
+                    uid=1,
+                    hotkey="hk1",
+                    commit_block=200,
+                    image="u/r:v1",
+                    digest=_DIGEST,
+                ),
+            ],
+            created_at=1700000000.0,
+        )
+        job.save(str(tmp_path))
+        assert (tmp_path / EVAL_JOB_FILE).exists()
+
+        removed = _clean_stale_eval_job(state, str(tmp_path))
+        assert removed is True
+        assert not (tmp_path / EVAL_JOB_FILE).exists()
+
+    def test_keeps_when_challenger_unknown(self, tmp_path):
+        state = ValidatorState()
+
+        job = EvalJob(
+            block=1000,
+            block_hash="0xabc",
+            challengers=[
+                ChallengerInfo(
+                    uid=1,
+                    hotkey="hk1",
+                    commit_block=200,
+                    image="u/r:v1",
+                    digest=_DIGEST,
+                ),
+            ],
+            created_at=1700000000.0,
+        )
+        job.save(str(tmp_path))
+
+        removed = _clean_stale_eval_job(state, str(tmp_path))
+        assert removed is False
+        assert (tmp_path / EVAL_JOB_FILE).exists()
+
+    def test_noop_when_no_file(self, tmp_path):
+        state = ValidatorState()
+        assert _clean_stale_eval_job(state, str(tmp_path)) is False
+
+    def test_mixed_known_and_unknown(self, tmp_path):
+        state = ValidatorState()
+        rec = _make_eval_record(1, "hk1", 200, 0.5, eval_block=500)
+        state.record_evaluation(rec, current_block=500)
+
+        job = EvalJob(
+            block=1000,
+            block_hash="0xabc",
+            challengers=[
+                ChallengerInfo(
+                    uid=1,
+                    hotkey="hk1",
+                    commit_block=200,
+                    image="u/r:v1",
+                    digest=_DIGEST,
+                ),
+                ChallengerInfo(
+                    uid=2,
+                    hotkey="hk2",
+                    commit_block=300,
+                    image="u/r:v2",
+                    digest="sha256:" + "b" * 64,
+                ),
+            ],
+            created_at=1700000000.0,
+        )
+        job.save(str(tmp_path))
+
+        removed = _clean_stale_eval_job(state, str(tmp_path))
+        assert removed is False
+        assert (tmp_path / EVAL_JOB_FILE).exists()
+
+
+# --------------------------------------------------------------------------- #
+# purge_old_logs
+# --------------------------------------------------------------------------- #
+
+
+class TestPurgeOldLogs:
+    def test_removes_old_logs(self, tmp_path):
+        logs = tmp_path / "logs"
+        logs.mkdir()
+        (logs / "cpu_validator_20260101_120000.log").write_text("old")
+        (logs / "cpu_validator_20260516_120000.log").write_text("recent")
+        (logs / "gpu_eval_20260102_090000.log").write_text("old gpu")
+        (logs / "random_file.txt").write_text("ignore me")
+
+        with patch("validator.config.LOG_RETENTION_DAYS", 10):
+            removed = purge_old_logs(str(tmp_path))
+
+        assert removed == 2
+        assert not (logs / "cpu_validator_20260101_120000.log").exists()
+        assert not (logs / "gpu_eval_20260102_090000.log").exists()
+        assert (logs / "cpu_validator_20260516_120000.log").exists()
+        assert (logs / "random_file.txt").exists()
+
+    def test_noop_when_disabled(self, tmp_path):
+        logs = tmp_path / "logs"
+        logs.mkdir()
+        (logs / "cpu_validator_20200101_120000.log").write_text("old")
+
+        with patch("validator.config.LOG_RETENTION_DAYS", 0):
+            removed = purge_old_logs(str(tmp_path))
+
+        assert removed == 0
+        assert (logs / "cpu_validator_20200101_120000.log").exists()
+
+    def test_noop_when_no_logs_dir(self, tmp_path):
+        removed = purge_old_logs(str(tmp_path))
+        assert removed == 0