Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions api/routes/eval_progress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""GET /api/eval-progress -- live eval round progress."""

import time

from fastapi import APIRouter
from fastapi.responses import JSONResponse

from api.config import STATE_DIR
from api.helpers.state_reader import safe_json_load, sanitize_floats

# Router that the eval-progress endpoint in this module is registered on.
router = APIRouter()

# Maximum age, in seconds, of the progress file's `updated_at` value before a
# "running" eval is flagged as possibly stale.
_STALE_THRESHOLD_S = 1800 # 30 minutes


@router.get(
    "/api/eval-progress",
    tags=["Overview"],
    summary="Live eval progress",
    description=(
        "Returns the current eval round progress, including phase, "
        "per-challenger status, GPU info, and a timestamped step timeline. "
        'Returns {"status": "idle"} when no eval is running.'
    ),
)
def eval_progress():
    """Return the current eval progress snapshot as a JSON response.

    Reads ``eval_progress.json`` from ``STATE_DIR``. When the loader yields
    ``None`` the endpoint reports ``{"status": "idle"}``. Otherwise the
    loaded payload is returned after passing through ``sanitize_floats``.

    A payload whose ``status`` is ``"running"`` but whose ``updated_at`` is
    older than ``_STALE_THRESHOLD_S`` seconds gets ``possibly_stale: True``
    added. A missing or non-numeric ``updated_at`` is treated as ``0``, so
    such a running payload is flagged as stale instead of raising.

    Every response carries a 5-second public ``Cache-Control`` header.
    """
    cache_headers = {"Cache-Control": "public, max-age=5"}

    data = safe_json_load(STATE_DIR / "eval_progress.json")
    if data is None:
        return JSONResponse(content={"status": "idle"}, headers=cache_headers)

    updated = data.get("updated_at", 0)
    if not isinstance(updated, (int, float)):
        # A null or string timestamp would make the subtraction below raise
        # TypeError; fall back to the same value used for a missing key.
        updated = 0

    if data.get("status") == "running" and time.time() - updated > _STALE_THRESHOLD_S:
        data["possibly_stale"] = True

    return JSONResponse(content=sanitize_floats(data), headers=cache_headers)
2 changes: 2 additions & 0 deletions api/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from api.routes.status import router as status_router
from api.routes.king import router as king_router
from api.routes.evaluations import router as evaluations_router
from api.routes.eval_progress import router as eval_progress_router
from api.routes.logs import router as logs_router
from api.routes.rounds import router as rounds_router

Expand Down Expand Up @@ -66,6 +67,7 @@ def _client_ip(request: Request) -> str:
# Attach each route module's router to the application.
app.include_router(status_router)
app.include_router(king_router)
app.include_router(evaluations_router)
app.include_router(eval_progress_router)
app.include_router(logs_router)
app.include_router(rounds_router)

Expand Down
135 changes: 134 additions & 1 deletion tests/test_cpu_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
from validator.chain import CommitmentRecord
from validator.cpu_validator import (
WEIGHTS_REFRESH_BLOCKS,
_clean_stale_eval_job,
_needs_weight_set,
run_tick,
)
from validator.eval_schema import EVAL_JOB_FILE, EvalJob
from validator.eval_progress import purge_old_logs
from validator.eval_schema import EVAL_JOB_FILE, ChallengerInfo, EvalJob
from validator.state import EvaluationRecord, ValidatorState

# Apply the "unit" marker to every test collected from this module.
pytestmark = pytest.mark.unit
Expand Down Expand Up @@ -457,3 +459,134 @@ def test_continues_on_s3_download_failure(self, _mock_dl, tmp_path):
)

assert summary["block"] == 1000


# --------------------------------------------------------------------------- #
# _clean_stale_eval_job
# --------------------------------------------------------------------------- #


class TestCleanStaleEvalJob:
    """Scenarios for ``_clean_stale_eval_job`` against a saved eval job file."""

    @staticmethod
    def _challenger(uid, hotkey, commit_block, image, digest):
        """Build one ``ChallengerInfo`` entry for a job fixture."""
        return ChallengerInfo(
            uid=uid,
            hotkey=hotkey,
            commit_block=commit_block,
            image=image,
            digest=digest,
        )

    @staticmethod
    def _save_job(directory, challengers):
        """Persist an ``EvalJob`` with the given challengers; return its path."""
        EvalJob(
            block=1000,
            block_hash="0xabc",
            challengers=challengers,
            created_at=1700000000.0,
        ).save(str(directory))
        return directory / EVAL_JOB_FILE

    @staticmethod
    def _state_with_hk1_evaluated():
        """Return a state that already holds an evaluation for uid 1 / hk1."""
        state = ValidatorState()
        state.record_evaluation(
            _make_eval_record(1, "hk1", 200, 0.5, eval_block=500),
            current_block=500,
        )
        return state

    def test_removes_when_all_challengers_known(self, tmp_path):
        state = self._state_with_hk1_evaluated()
        job_path = self._save_job(
            tmp_path,
            [self._challenger(1, "hk1", 200, "u/r:v1", _DIGEST)],
        )
        assert job_path.exists()

        outcome = _clean_stale_eval_job(state, str(tmp_path))

        assert outcome is True
        assert not job_path.exists()

    def test_keeps_when_challenger_unknown(self, tmp_path):
        state = ValidatorState()
        job_path = self._save_job(
            tmp_path,
            [self._challenger(1, "hk1", 200, "u/r:v1", _DIGEST)],
        )

        outcome = _clean_stale_eval_job(state, str(tmp_path))

        assert outcome is False
        assert job_path.exists()

    def test_noop_when_no_file(self, tmp_path):
        empty_state = ValidatorState()
        assert _clean_stale_eval_job(empty_state, str(tmp_path)) is False

    def test_mixed_known_and_unknown(self, tmp_path):
        state = self._state_with_hk1_evaluated()
        # Only the first challenger has a recorded evaluation in the state.
        job_path = self._save_job(
            tmp_path,
            [
                self._challenger(1, "hk1", 200, "u/r:v1", _DIGEST),
                self._challenger(2, "hk2", 300, "u/r:v2", "sha256:" + "b" * 64),
            ],
        )

        outcome = _clean_stale_eval_job(state, str(tmp_path))

        assert outcome is False
        assert job_path.exists()


# --------------------------------------------------------------------------- #
# purge_old_logs
# --------------------------------------------------------------------------- #


class TestPurgeOldLogs:
    """Scenarios for ``purge_old_logs`` with a patched retention window.

    Log filenames carry a ``YYYYMMDD_HHMMSS`` stamp. The retention test builds
    those stamps relative to the current time so it does not start failing
    once a hard-coded "recent" date ages past the retention window.
    NOTE(review): this assumes ``purge_old_logs`` derives a log's age from the
    filename stamp -- confirm against its implementation.
    """

    @staticmethod
    def _log_name(prefix, days_ago):
        """Return ``<prefix>_<YYYYMMDD_HHMMSS>.log`` stamped ``days_ago`` days back."""
        from datetime import datetime, timedelta

        stamp = datetime.now() - timedelta(days=days_ago)
        return f"{prefix}_{stamp:%Y%m%d_%H%M%S}.log"

    def test_removes_old_logs(self, tmp_path):
        logs = tmp_path / "logs"
        logs.mkdir()
        # Ages sit well clear of the 10-day cutoff on both sides so timezone
        # or clock-granularity differences cannot flip the outcome.
        old_cpu = logs / self._log_name("cpu_validator", 60)
        recent_cpu = logs / self._log_name("cpu_validator", 1)
        old_gpu = logs / self._log_name("gpu_eval", 59)
        unrelated = logs / "random_file.txt"
        old_cpu.write_text("old")
        recent_cpu.write_text("recent")
        old_gpu.write_text("old gpu")
        unrelated.write_text("ignore me")

        with patch("validator.config.LOG_RETENTION_DAYS", 10):
            removed = purge_old_logs(str(tmp_path))

        assert removed == 2
        assert not old_cpu.exists()
        assert not old_gpu.exists()
        assert recent_cpu.exists()
        assert unrelated.exists()

    def test_noop_when_disabled(self, tmp_path):
        logs = tmp_path / "logs"
        logs.mkdir()
        ancient = logs / "cpu_validator_20200101_120000.log"
        ancient.write_text("old")

        # A retention of 0 is expected to leave even very old logs in place.
        with patch("validator.config.LOG_RETENTION_DAYS", 0):
            removed = purge_old_logs(str(tmp_path))

        assert removed == 0
        assert ancient.exists()

    def test_noop_when_no_logs_dir(self, tmp_path):
        removed = purge_old_logs(str(tmp_path))
        assert removed == 0
Loading
Loading