diff --git a/src/game/metrics.py b/src/game/metrics.py
index 0a5798b..b87a39a 100644
--- a/src/game/metrics.py
+++ b/src/game/metrics.py
@@ -24,6 +24,7 @@
 import json
 import re
 import sys
+from argparse import ArgumentParser
 from pathlib import Path
 from threading import Lock
 
@@ -613,6 +614,40 @@ def _persist_overall_metrics(self) -> None:
         with path.open("w", encoding="utf-8") as fp:
             json.dump(payload, fp, ensure_ascii=False, indent=2)
 
+    # ------------------------------------------------------------------ #
+    # Historical aggregation helpers
+    # ------------------------------------------------------------------ #
+
+    @classmethod
+    def aggregate_from_summaries(
+        cls, summaries: Iterable[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """
+        Produce aggregate metrics from previously persisted game summaries.
+
+        Args:
+            summaries: Iterable of game summaries (as produced by _summarize_game).
+
+        Returns:
+            Dict containing ``metrics`` and ``quality_score`` blocks matching the
+            structure of ``logs/metrics/overall.json``.
+        """
+        instance = cls()
+        instance.reset()
+
+        with instance._lock:
+            instance.completed_games = list(summaries)
+            win_counts = Counter()
+            for summary in instance.completed_games:
+                winner = summary.get("winner")
+                if winner:
+                    win_counts[winner] += 1
+            instance.win_counts = win_counts
+
+        metrics = instance.get_overall_metrics()
+        quality_score = instance._compute_functional_score(metrics)
+        return {"metrics": metrics, "quality_score": quality_score}
+
 
 # Global collector used by the rest of the codebase.
 metrics_collector = GameMetrics()
@@ -718,8 +753,153 @@ def run_multilingual_metrics_batch(
     return {"metrics": overall_metrics, "quality_score": quality_score}
 
 
+def load_saved_game_summaries(
+    metrics_dir: Optional[Path] = None,
+) -> List[Dict[str, Any]]:
+    """
+    Load all persisted game summaries from ``logs/metrics`` style directories.
+
+    Args:
+        metrics_dir: Optional override directory. Defaults to the repository's
+            ``logs/metrics`` folder.
+
+    Returns:
+        List of summary dictionaries extracted from ``.json`` files.
+        Files whose name starts with ``overall`` are ignored, as are files
+        that cannot be read or decoded, or whose top-level JSON value is not
+        an object with a ``summary`` object.
+    """
+    metrics_dir = metrics_dir or (BASE_DIR / "logs" / "metrics")
+    if not metrics_dir.exists():
+        return []
+
+    summaries: List[Dict[str, Any]] = []
+    for path in sorted(metrics_dir.glob("*.json")):
+        if path.name.startswith("overall"):
+            continue
+        try:
+            with path.open("r", encoding="utf-8") as fp:
+                data = json.load(fp)
+        except (OSError, ValueError):
+            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+            continue
+        # A file may hold valid JSON that is not an object (e.g. a list).
+        summary = data.get("summary") if isinstance(data, dict) else None
+        if isinstance(summary, dict):
+            summaries.append(summary)
+
+    return summaries
+
+
+def aggregate_saved_metrics(
+    *,
+    metrics_dir: Optional[Path] = None,
+    output_path: Optional[Path] = None,
+) -> Dict[str, Any]:
+    """
+    Aggregate metrics from previously saved game summary files.
+
+    Args:
+        metrics_dir: Optional directory to search. Defaults to ``logs/metrics``.
+        output_path: Optional path to write the aggregated payload. When omitted,
+            the result is returned without writing to disk.
+
+    Returns:
+        Aggregated payload containing ``metrics`` and ``quality_score`` blocks.
+    """
+    summaries = load_saved_game_summaries(metrics_dir)
+    result = GameMetrics.aggregate_from_summaries(summaries)
+
+    if output_path is not None:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with output_path.open("w", encoding="utf-8") as fp:
+            json.dump(result, fp, ensure_ascii=False, indent=2)
+
+    return result
+
+
+def _parse_args(argv: Optional[List[str]] = None):
+    """
+    Build the CLI parser and parse ``argv``.
+
+    Args:
+        argv: Optional list of argument strings. When ``None``, argparse reads
+            ``sys.argv[1:]``.
+
+    Returns:
+        Parsed namespace. ``command`` defaults to ``batch`` and ``concurrent``
+        to ``True`` when no sub-command is given.
+    """
+    parser = ArgumentParser(
+        description="Metrics utilities for LieGraph. Defaults to running a multilingual batch."
+    )
+    subparsers = parser.add_subparsers(dest="command")
+
+    batch_parser = subparsers.add_parser(
+        "batch", help="Run the built-in multilingual benchmark (default)."
+    )
+    batch_parser.add_argument(
+        "--sequential",
+        dest="concurrent",
+        action="store_false",
+        help="Run games sequentially instead of concurrently.",
+    )
+    batch_parser.set_defaults(concurrent=True)
+
+    history_parser = subparsers.add_parser(
+        "history",
+        help="Aggregate metrics across previously persisted game summaries.",
+    )
+    history_parser.add_argument(
+        "--metrics-dir",
+        type=str,
+        default=None,
+        help="Directory containing per-game summary JSON files (default: logs/metrics).",
+    )
+    history_parser.add_argument(
+        "--output",
+        type=str,
+        default=None,
+        help="Optional path to write the aggregated payload (JSON).",
+    )
+
+    parser.set_defaults(command="batch", concurrent=True)
+    return parser.parse_args(argv)
+
+
+def main(argv: Optional[List[str]] = None):
+    """
+    Command-line entry point for the metrics utilities.
+
+    Args:
+        argv: Optional list of argument strings forwarded to ``_parse_args``.
+
+    Returns:
+        The ``history`` aggregate (after printing it) or the result of the
+        multilingual batch run.
+    """
+    args = _parse_args(argv)
+
+    if args.command == "history":
+        metrics_dir = Path(args.metrics_dir).expanduser() if args.metrics_dir else None
+        output_path = Path(args.output).expanduser() if args.output else None
+        result = aggregate_saved_metrics(
+            metrics_dir=metrics_dir,
+            output_path=output_path,
+        )
+        print("\nAggregated historical metrics:")
+        print(json.dumps(result["metrics"], ensure_ascii=False, indent=2))
+        print("\nQuality score:")
+        print(json.dumps(result["quality_score"], ensure_ascii=False, indent=2))
+        return result
+
+    # Default: run multilingual batch
+    result = run_multilingual_metrics_batch(concurrent=args.concurrent)
+    return result
+
+
 if __name__ == "__main__":
-    run_multilingual_metrics_batch(concurrent=True)
+    main()
 
 
 __all__ = [
@@ -727,4 +907,7 @@ def run_multilingual_metrics_batch(
     "metrics_collector",
     "MULTILINGUAL_VOCABULARY_BATCH",
     "run_multilingual_metrics_batch",
+    "load_saved_game_summaries",
+    "aggregate_saved_metrics",
+    "main",
 ]
diff --git a/tests/test_metrics_history.py b/tests/test_metrics_history.py
new file mode 100644
index 0000000..15a5e43
--- /dev/null
+++ b/tests/test_metrics_history.py
@@ -0,0 +1,96 @@
+import json
+
+from pytest import approx
+
+from src.game.metrics import GameMetrics, load_saved_game_summaries
+
+
+def test_load_saved_game_summaries_skips_overall(tmp_path):
+    summary = {
+        "summary": {
+            "game_id": "game-1",
+            "winner": "civilians",
+            "round_metrics": {},
+            "self_accuracy_trend": None,
+            "suspicion_accuracy_trend": None,
+            "speech_diversity": {
+                "average_diversity": 0.0,
+                "average_unique_tokens": 0.0,
+                "average_total_tokens": 0.0,
+                "by_player": {},
+            },
+        }
+    }
+    (tmp_path / "game-1.json").write_text(json.dumps(summary), encoding="utf-8")
+    # This file should be ignored.
+    (tmp_path / "overall.json").write_text("{}", encoding="utf-8")
+
+    loaded = load_saved_game_summaries(tmp_path)
+    assert len(loaded) == 1
+    assert loaded[0]["game_id"] == "game-1"
+
+
+def test_load_saved_game_summaries_skips_unusable_files(tmp_path):
+    # Valid JSON that is not an object, malformed JSON, and invalid UTF-8.
+    (tmp_path / "list.json").write_text("[]", encoding="utf-8")
+    (tmp_path / "broken.json").write_text("{not json", encoding="utf-8")
+    (tmp_path / "binary.json").write_bytes(b"\xff\xfe\x00")
+
+    assert load_saved_game_summaries(tmp_path) == []
+
+
+def test_aggregate_from_summaries():
+    summaries = [
+        {
+            "game_id": "game-1",
+            "winner": "civilians",
+            "round_metrics": {1: {"self_accuracy": 0.8, "suspicion_accuracy": 0.6}},
+            "self_accuracy_trend": 0.1,
+            "suspicion_accuracy_trend": 0.05,
+            "speech_diversity": {
+                "average_diversity": 0.4,
+                "average_unique_tokens": 3.0,
+                "average_total_tokens": 5.0,
+                "by_player": {},
+            },
+        },
+        {
+            "game_id": "game-2",
+            "winner": "spies",
+            "round_metrics": {
+                1: {"self_accuracy": 0.6, "suspicion_accuracy": 0.4},
+                2: {"self_accuracy": 0.7, "suspicion_accuracy": None},
+            },
+            "self_accuracy_trend": 0.0,
+            "suspicion_accuracy_trend": -0.05,
+            "speech_diversity": {
+                "average_diversity": 0.5,
+                "average_unique_tokens": 4.0,
+                "average_total_tokens": 6.0,
+                "by_player": {},
+            },
+        },
+    ]
+
+    result = GameMetrics.aggregate_from_summaries(summaries)
+    metrics = result["metrics"]
+    quality = result["quality_score"]
+
+    assert metrics["games_played"] == 2
+    assert metrics["win_rate"] == {"civilians": 0.5, "spies": 0.5}
+    assert metrics["win_balance_score"] == 1.0
+    identification = metrics["identification"]
+    assert identification["average_self_accuracy"] == approx(0.7)
+    assert identification["average_suspicion_accuracy"] == approx(0.5)
+    assert identification["self_accuracy_trend"] == approx(0.05)
+    assert identification["suspicion_accuracy_trend"] == approx(0.0)
+    speech = metrics["speech_diversity"]
+    assert speech["average_diversity"] == approx(0.45)
+    assert speech["average_unique_tokens"] == approx(3.5)
+    assert speech["average_total_tokens"] == approx(5.5)
+
+    assert quality["overall_score"] == approx(0.7988, abs=1e-4)
+    assert quality["win_balance"] == approx(1.0, abs=1e-4)
+    assert quality["identification"] == approx(0.725, abs=1e-4)
+    assert quality["suspicion_trend"] == approx(0.5, abs=1e-4)
+    assert quality["speech_diversity"] == approx(0.45, abs=1e-4)