Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 159 additions & 1 deletion src/game/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import json
import re
import sys
from argparse import ArgumentParser
from pathlib import Path
from threading import Lock

Expand Down Expand Up @@ -613,6 +614,40 @@ def _persist_overall_metrics(self) -> None:
with path.open("w", encoding="utf-8") as fp:
json.dump(payload, fp, ensure_ascii=False, indent=2)

# ------------------------------------------------------------------ #
# Historical aggregation helpers
# ------------------------------------------------------------------ #

@classmethod
def aggregate_from_summaries(
    cls, summaries: Iterable[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Rebuild aggregate metrics from previously persisted game summaries.

    A fresh collector is created and reset, seeded with the given summaries
    and a per-winner tally, and then asked for its overall metrics and
    functional quality score.

    Args:
        summaries: Iterable of game summary dicts (as produced by
            _summarize_game). It is materialised into a list once.

    Returns:
        Dict with a ``metrics`` block and a ``quality_score`` block, matching
        the structure of ``logs/metrics/overall.json``.
    """
    collector = cls()
    collector.reset()

    with collector._lock:
        collector.completed_games = list(summaries)
        # Summaries whose "winner" is missing or falsy do not count as a win.
        winners = (game.get("winner") for game in collector.completed_games)
        collector.win_counts = Counter(name for name in winners if name)

    # Computed after the lock is released.
    # NOTE(review): presumably get_overall_metrics takes the lock itself - confirm.
    overall = collector.get_overall_metrics()
    return {
        "metrics": overall,
        "quality_score": collector._compute_functional_score(overall),
    }


# Global collector used by the rest of the codebase: a single module-level
# GameMetrics instance created when this module is imported.
metrics_collector = GameMetrics()
Expand Down Expand Up @@ -718,13 +753,136 @@ def run_multilingual_metrics_batch(
return {"metrics": overall_metrics, "quality_score": quality_score}


def load_saved_game_summaries(
    metrics_dir: Optional[Path] = None,
) -> List[Dict[str, Any]]:
    """
    Load all persisted game summaries from ``logs/metrics`` style directories.

    Loading is best-effort: a file that cannot be read, is not valid UTF-8,
    is not valid JSON, or does not hold a JSON object with a dict under the
    ``summary`` key is skipped instead of aborting the whole load.

    Args:
        metrics_dir: Optional override directory. Defaults to the repository's
            ``logs/metrics`` folder.

    Returns:
        List of summary dictionaries extracted from ``<game_id>.json`` files,
        ordered by sorted file path. Files whose name starts with ``overall``
        are ignored. An empty list is returned when the directory is missing.
    """
    root = Path(metrics_dir) if metrics_dir else (BASE_DIR / "logs" / "metrics")
    if not root.is_dir():
        return []

    summaries: List[Dict[str, Any]] = []
    # Sorted so the result order does not depend on filesystem listing order.
    for path in sorted(root.glob("*.json")):
        if path.name.startswith("overall"):
            continue
        try:
            with path.open("r", encoding="utf-8") as fp:
                data = json.load(fp)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            continue
        # A valid JSON file may hold a list/scalar at top level; only objects
        # can carry a "summary" entry.
        if not isinstance(data, dict):
            continue
        summary = data.get("summary")
        if isinstance(summary, dict):
            summaries.append(summary)

    return summaries


def aggregate_saved_metrics(
    *,
    metrics_dir: Optional[Path] = None,
    output_path: Optional[Path] = None,
) -> Dict[str, Any]:
    """
    Aggregate metrics across previously saved game summary files.

    Args:
        metrics_dir: Optional directory to search. Defaults to ``logs/metrics``.
        output_path: Optional file to receive the aggregated payload as JSON.
            Missing parent directories are created. When omitted, nothing is
            written to disk.

    Returns:
        Aggregated payload containing ``metrics`` and ``quality_score`` blocks.
    """
    payload = GameMetrics.aggregate_from_summaries(
        load_saved_game_summaries(metrics_dir)
    )

    # Nothing to persist: hand the payload straight back.
    if not output_path:
        return payload

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
    return payload


def _parse_args():
parser = ArgumentParser(
description="Metrics utilities for LieGraph. Defaults to running a multilingual batch."
)
subparsers = parser.add_subparsers(dest="command")

batch_parser = subparsers.add_parser(
"batch", help="Run the built-in multilingual benchmark (default)."
)
batch_parser.add_argument(
"--sequential",
dest="concurrent",
action="store_false",
help="Run games sequentially instead of concurrently.",
)
batch_parser.set_defaults(concurrent=True)

history_parser = subparsers.add_parser(
"history",
help="Aggregate metrics across previously persisted game summaries.",
)
history_parser.add_argument(
"--metrics-dir",
type=str,
default=None,
help="Directory containing per-game summary JSON files (default: logs/metrics).",
)
history_parser.add_argument(
"--output",
type=str,
default=None,
help="Optional path to write the aggregated payload (JSON).",
)

parser.set_defaults(command="batch", concurrent=True)
return parser.parse_args()


def main():
    """
    Command-line entry point for the metrics utilities.

    ``history`` aggregates previously saved summaries, prints the metrics and
    quality score as JSON, and returns the aggregated payload. Any other
    command runs the multilingual batch and returns its result.
    """
    args = _parse_args()

    if args.command != "history":
        # Default: run multilingual batch
        return run_multilingual_metrics_batch(concurrent=args.concurrent)

    # Both options are optional strings; only build a Path when one was given.
    metrics_dir = None
    if args.metrics_dir:
        metrics_dir = Path(args.metrics_dir).expanduser()
    output_path = None
    if args.output:
        output_path = Path(args.output).expanduser()

    result = aggregate_saved_metrics(
        metrics_dir=metrics_dir,
        output_path=output_path,
    )

    sections = (
        ("\nAggregated historical metrics:", "metrics"),
        ("\nQuality score:", "quality_score"),
    )
    for heading, key in sections:
        print(heading)
        print(json.dumps(result[key], ensure_ascii=False, indent=2))
    return result


if __name__ == "__main__":
    # Dispatch only through the CLI entry point. Calling the batch runner here
    # as well would execute the benchmark before every command (including
    # ``history``) and twice for the default ``batch`` command.
    main()


# Public API of this module: the names exported by a star-import.
__all__ = [
    "GameMetrics",
    "metrics_collector",
    "MULTILINGUAL_VOCABULARY_BATCH",
    "run_multilingual_metrics_batch",
    "load_saved_game_summaries",
    "aggregate_saved_metrics",
    "main",
]
87 changes: 87 additions & 0 deletions tests/test_metrics_history.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import json

from src.game.metrics import GameMetrics, load_saved_game_summaries


def test_load_saved_game_summaries_skips_overall(tmp_path):
    """Per-game files are loaded; files named ``overall*`` are skipped."""
    game_summary = {
        "game_id": "game-1",
        "winner": "civilians",
        "round_metrics": {},
        "self_accuracy_trend": None,
        "suspicion_accuracy_trend": None,
        "speech_diversity": {
            "average_diversity": 0.0,
            "average_unique_tokens": 0.0,
            "average_total_tokens": 0.0,
            "by_player": {},
        },
    }
    game_file = tmp_path / "game-1.json"
    game_file.write_text(json.dumps({"summary": game_summary}), encoding="utf-8")
    # This file should be ignored.
    overall_file = tmp_path / "overall.json"
    overall_file.write_text("{}", encoding="utf-8")

    loaded = load_saved_game_summaries(tmp_path)

    # Exactly one summary comes back, and it is the per-game one.
    assert [entry["game_id"] for entry in loaded] == ["game-1"]


def test_aggregate_from_summaries():
    """
    Aggregating two hand-written summaries yields the expected averages.

    Computed float values are compared with ``approx`` rather than ``==`` so
    the test does not depend on the exact rounding of intermediate sums.
    """
    from pytest import approx

    summaries = [
        {
            "game_id": "game-1",
            "winner": "civilians",
            "round_metrics": {1: {"self_accuracy": 0.8, "suspicion_accuracy": 0.6}},
            "self_accuracy_trend": 0.1,
            "suspicion_accuracy_trend": 0.05,
            "speech_diversity": {
                "average_diversity": 0.4,
                "average_unique_tokens": 3.0,
                "average_total_tokens": 5.0,
                "by_player": {},
            },
        },
        {
            "game_id": "game-2",
            "winner": "spies",
            "round_metrics": {
                1: {"self_accuracy": 0.6, "suspicion_accuracy": 0.4},
                2: {"self_accuracy": 0.7, "suspicion_accuracy": None},
            },
            "self_accuracy_trend": 0.0,
            "suspicion_accuracy_trend": -0.05,
            "speech_diversity": {
                "average_diversity": 0.5,
                "average_unique_tokens": 4.0,
                "average_total_tokens": 6.0,
                "by_player": {},
            },
        },
    ]

    result = GameMetrics.aggregate_from_summaries(summaries)
    metrics = result["metrics"]
    quality = result["quality_score"]

    assert metrics["games_played"] == 2
    assert metrics["win_rate"] == approx({"civilians": 0.5, "spies": 0.5})
    assert metrics["win_balance_score"] == approx(1.0)

    identification = metrics["identification"]
    assert identification["average_self_accuracy"] == approx(0.7)
    assert identification["average_suspicion_accuracy"] == approx(0.5)
    assert identification["self_accuracy_trend"] == approx(0.05)
    # Expected value is zero, where a relative tolerance is meaningless.
    assert identification["suspicion_accuracy_trend"] == approx(0.0, abs=1e-9)

    speech = metrics["speech_diversity"]
    assert speech["average_diversity"] == approx(0.45)
    assert speech["average_unique_tokens"] == approx(3.5)
    assert speech["average_total_tokens"] == approx(5.5)

    assert quality["overall_score"] == approx(0.7988, abs=1e-4)
    assert quality["win_balance"] == approx(1.0, abs=1e-4)
    assert quality["identification"] == approx(0.725, abs=1e-4)
    assert quality["suspicion_trend"] == approx(0.5, abs=1e-4)
    assert quality["speech_diversity"] == approx(0.45, abs=1e-4)
Loading