-
Notifications
You must be signed in to change notification settings - Fork 0
GRA-216: Add graduation threshold experiment flag shim #188
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 4 commits
6e286db
9f61abb
8206929
3fc53c6
0e03bd1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| # Weekly Correction Snapshot | ||
|
|
||
| `scripts/weekly_correction_snapshot.py` builds a deterministic JSON summary from newline-delimited JSON (NDJSON) events. This is intended for weekly correction-outcome trend reporting. | ||
|
|
||
| ## Usage | ||
|
|
||
| From file: | ||
|
|
||
| ```bash | ||
| python scripts/weekly_correction_snapshot.py --input /path/to/events.jsonl | ||
| ``` | ||
|
|
||
| From stdin: | ||
|
|
||
| ```bash | ||
| cat /path/to/events.jsonl | python scripts/weekly_correction_snapshot.py | ||
| ``` | ||
|
|
||
| ## Output schema | ||
|
|
||
| The script always emits one compact JSON object with stable key ordering: | ||
|
|
||
| - `total_corrections` (int): count of correction events (`event=correction.created` or `kind=correction`) | ||
| - `accepted_graduations` (int): count of accepted graduation outcomes | ||
| - `rejection_count` (int): count of rejected graduation outcomes | ||
| - `acceptance_rate` (float): `accepted_graduations / (accepted_graduations + rejection_count)`, rounded to 6 decimals, or `0.0` if denominator is zero | ||
| - `top_rule_categories` (list): up to 5 entries sorted by descending count, then category name | ||
| - `skipped_rows` (int): malformed or non-object rows ignored during parsing | ||
|
|
||
| `top_rule_categories` entries use: | ||
|
|
||
| ```json | ||
| {"category":"tone","count":12} | ||
| ``` | ||
|
|
||
| Category normalization is lowercase + trimmed whitespace. Empty/missing categories normalize to `"unknown"`. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,130 @@ | ||
| #!/usr/bin/env python3 | ||
| """Compute weekly correction/graduation aggregates from NDJSON events.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import argparse | ||
| import json | ||
| import sys | ||
| from collections import Counter | ||
| from typing import Any | ||
|
|
||
|
|
||
def _normalize_category(value: Any) -> str:
    """Return a canonical category label for grouping.

    The value is stringified, stripped of surrounding whitespace and
    lowercased. ``None`` and values that end up empty after trimming both
    map to ``"unknown"``.
    """
    cleaned = "" if value is None else str(value).strip().lower()
    return cleaned if cleaned else "unknown"
|
|
||
|
|
||
def _is_correction(row: dict[str, Any]) -> bool:
    """Tell whether *row* is a correction event.

    A row qualifies when its ``event`` field equals ``correction.created``
    or its ``kind`` field equals ``correction``; both comparisons ignore
    case and surrounding whitespace.
    """

    def _field(key: str) -> str:
        # Missing keys behave like an empty string; other values are stringified.
        return str(row.get(key, "")).strip().lower()

    if _field("event") == "correction.created":
        return True
    return _field("kind") == "correction"
|
|
||
|
|
||
def _is_graduation_accepted(row: dict[str, Any]) -> bool:
    """Tell whether *row* signals an accepted graduation.

    Any one of these signals is sufficient:

    * ``event`` is ``lesson.graduated`` or ``graduation.accepted``
    * ``outcome`` is ``accepted``
    * ``accepted`` is the boolean ``True`` (truthy non-bool values do not count)
    * ``status`` is ``accepted`` or ``graduated``

    String comparisons ignore case and surrounding whitespace.
    """
    # NOTE(review): the outcome/accepted/status checks are not gated on the
    # row being a graduation event, so any row carrying those fields matches.
    # Confirm against the event schema whether that is intended.

    def _text(key: str) -> str:
        return str(row.get(key, "")).strip().lower()

    if _text("event") in {"lesson.graduated", "graduation.accepted"}:
        return True
    if _text("outcome") == "accepted":
        return True
    if row.get("accepted") is True:
        return True
    return _text("status") in {"accepted", "graduated"}
|
|
||
|
|
||
def _is_rejection(row: dict[str, Any]) -> bool:
    """Tell whether *row* signals a rejected graduation.

    Any one of these signals is sufficient:

    * ``event`` is ``graduation.rejected`` or ``lesson.rejected``
    * ``outcome`` is ``rejected``
    * ``accepted`` is the boolean ``False`` (``None``, ``0`` and a missing
      key do not count)
    * ``status`` is ``rejected``

    String comparisons ignore case and surrounding whitespace.
    """
    # NOTE(review): the outcome/accepted/status checks are not gated on the
    # row being a graduation event, so any row carrying those fields matches.
    # Confirm against the event schema whether that is intended.

    def _text(key: str) -> str:
        return str(row.get(key, "")).strip().lower()

    if _text("event") in {"graduation.rejected", "lesson.rejected"}:
        return True
    if _text("outcome") == "rejected":
        return True
    if row.get("accepted") is False:
        return True
    return _text("status") == "rejected"
|
|
||
|
|
||
def parse_rows(lines: list[str]) -> tuple[list[dict[str, Any]], int]:
    """Decode NDJSON lines into dict rows and count the rejects.

    Blank lines are ignored entirely. A line is counted as skipped when it
    is not valid JSON or when it decodes to anything other than a JSON
    object.

    Returns:
        A ``(rows, skipped)`` pair: the decoded objects in input order and
        the number of skipped lines.
    """
    parsed: list[dict[str, Any]] = []
    skipped = 0
    for text in (raw.strip() for raw in lines):
        if not text:
            continue
        try:
            candidate = json.loads(text)
        except json.JSONDecodeError:
            # Malformed JSON is handled like any other non-object payload.
            candidate = None
        if isinstance(candidate, dict):
            parsed.append(candidate)
        else:
            skipped += 1
    return parsed, skipped
|
|
||
|
|
||
def aggregate(rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Fold parsed event rows into the weekly snapshot counters.

    Returns a dict with:

    * ``total_corrections``: rows recognised by ``_is_correction``
    * ``accepted_graduations`` / ``rejection_count``: rows matching exactly
      one of ``_is_graduation_accepted`` / ``_is_rejection``
    * ``acceptance_rate``: accepted / (accepted + rejected), rounded to six
      decimals, or ``0.0`` when no row was accepted or rejected
    * ``top_rule_categories``: up to five ``{"category", "count"}`` entries
      for correction rows, ordered by descending count, then category name
    """
    category_counts: Counter[str] = Counter()
    correction_total = 0
    accepted_total = 0
    rejected_total = 0

    for row in rows:
        if _is_correction(row):
            correction_total += 1
            category_counts[_normalize_category(row.get("category"))] += 1

        accepted = _is_graduation_accepted(row)
        rejected = _is_rejection(row)
        # Rows with contradictory signals (both) or no signal (neither)
        # contribute to neither counter.
        if accepted != rejected:
            if accepted:
                accepted_total += 1
            else:
                rejected_total += 1

    decided = accepted_total + rejected_total
    if decided:
        rate = round(accepted_total / decided, 6)
    else:
        rate = 0.0

    # Two stable passes: alphabetical first, then by descending count, so
    # that ties on count stay in alphabetical order.
    ranked = sorted(category_counts.items(), key=lambda pair: pair[0])
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    top_categories = [{"category": label, "count": hits} for label, hits in ranked[:5]]

    return {
        "total_corrections": correction_total,
        "accepted_graduations": accepted_total,
        "rejection_count": rejected_total,
        "acceptance_rate": rate,
        "top_rule_categories": top_categories,
    }
|
|
||
|
|
||
def _read_lines(path: str | None) -> list[str]:
    """Read all input lines from *path*, or from stdin when *path* is falsy.

    A leading UTF-8 byte-order mark is dropped. Otherwise the BOM would stay
    attached to the first line, ``json.loads`` would reject that line, and
    the first event would be silently counted as a skipped row.

    Args:
        path: Path of the NDJSON file, or ``None``/empty to read stdin.

    Returns:
        The lines of the input, line endings preserved.

    Raises:
        OSError: If *path* cannot be opened.
    """
    if path:
        # "utf-8-sig" decodes plain UTF-8 identically and strips a BOM if present.
        with open(path, encoding="utf-8-sig") as handle:
            return handle.readlines()

    lines = sys.stdin.readlines()
    # stdin is already decoded, so a BOM shows up as a U+FEFF character.
    if lines and lines[0].startswith("\ufeff"):
        lines[0] = lines[0][1:]
    return lines
|
|
||
|
|
||
def main(argv: list[str] | None = None) -> int:
    """Run the snapshot CLI and print one compact JSON object to stdout.

    Args:
        argv: Argument list to parse; ``None`` lets argparse fall back to
            the process arguments.

    Returns:
        Always ``0``.
    """
    parser = argparse.ArgumentParser(
        description="Compute correction-outcome aggregates for weekly trend snapshots."
    )
    parser.add_argument("--input", help="Path to newline-delimited JSON input file")
    options = parser.parse_args(argv)

    rows, skipped_rows = parse_rows(_read_lines(options.input))
    snapshot = {**aggregate(rows), "skipped_rows": skipped_rows}

    # Sorted keys and compact separators keep the output deterministic.
    sys.stdout.write(json.dumps(snapshot, sort_keys=True, separators=(",", ":")) + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -10,6 +10,7 @@ | |||||
| import contextlib | ||||||
| import logging | ||||||
| import math | ||||||
| import os | ||||||
| import re # used by export functions for slug sanitization | ||||||
| import statistics | ||||||
| from datetime import UTC | ||||||
|
|
@@ -30,6 +31,11 @@ | |||||
| _STATE_RANK = {"INSTINCT": 0, "PATTERN": 1, "RULE": 2} | ||||||
| # Severity ordering for min_severity gating | ||||||
| _SEV_RANK = {"as-is": 0, "minor": 1, "moderate": 2, "major": 3, "discarded": 4} | ||||||
| _LOW_SIGNAL_EDIT_DISTANCE_FLOOR = 0.04 | ||||||
| # FORMAT/DRAFTING synonym swaps carry minimal signal; require a larger edit | ||||||
| # before recording a lesson so we don't learn from synonym-level noise. | ||||||
| _FORMAT_DRAFTING_EDIT_DISTANCE_FLOOR = 0.07 | ||||||
| _FORMAT_DRAFTING_CATEGORIES = frozenset({"FORMAT", "DRAFTING"}) | ||||||
|
|
||||||
| # Map evaluator dimension names to correction categories | ||||||
| _DIMENSION_CATEGORY_MAP = { | ||||||
|
|
@@ -54,6 +60,18 @@ def _filter_lessons_by_state(lessons, min_state: str = "PATTERN"): | |||||
| ] | ||||||
|
|
||||||
|
|
||||||
def _is_meaningful_low_signal_change(draft: str, final: str, category: str) -> bool:
    """Decide whether a tiny edit should bypass the low-signal floor.

    Returns ``True`` when the category (case-insensitive; a falsy category
    is treated as ``UNKNOWN``) is ``ACCURACY`` or ``SECURITY``. It also
    returns ``True`` when *draft* and *final* differ only in letter case and
    *final* contains an all-caps token of two or more letters or a
    capitalised word of three or more letters. Every other edit returns
    ``False``.
    """
    normalized_category = (category or "UNKNOWN").upper()
    if normalized_category in {"ACCURACY", "SECURITY"}:
        return True

    # Only pure capitalization changes are considered beyond this point.
    case_only_edit = draft != final and draft.lower() == final.lower()
    if not case_only_edit:
        return False

    # Acronym- or proper-noun-shaped tokens suggest the case fix carries meaning.
    return re.search(r"\b[A-Z]{2,}\b|\b[A-Z][a-z]{2,}\b", final) is not None
|
|
||||||
|
|
||||||
| # ── correct() ────────────────────────────────────────────────────────── | ||||||
|
|
||||||
|
|
||||||
|
|
@@ -99,7 +117,7 @@ def brain_correct( | |||||
| agent_type: str | None = None, | ||||||
| approval_required: bool = False, | ||||||
| dry_run: bool = False, | ||||||
| min_severity: str = "as-is", | ||||||
| min_severity: str = "minor", | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Revert the default `min_severity` change. Line 120 changes the default gating from `"as-is"` to `"minor"`.
Proposed fix:
- min_severity: str = "minor",
+ min_severity: str = "as-is",
📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||
| scope: str | None = None, | ||||||
| applies_to: str | None = None, | ||||||
| auto_heal: bool = False, | ||||||
|
|
@@ -353,8 +371,22 @@ def brain_correct( | |||||
| update_confidence, | ||||||
| ) | ||||||
|
|
||||||
| if not is_observation_dup and _SEV_RANK.get(diff.severity, 0) >= _SEV_RANK.get( | ||||||
| min_severity, 0 | ||||||
| _cat_upper = (category or "UNKNOWN").upper() | ||||||
| _ed_floor = ( | ||||||
| _FORMAT_DRAFTING_EDIT_DISTANCE_FLOOR | ||||||
| if _cat_upper in _FORMAT_DRAFTING_CATEGORIES | ||||||
| else _LOW_SIGNAL_EDIT_DISTANCE_FLOOR | ||||||
| ) | ||||||
| low_signal_filtered = ( | ||||||
| diff.severity in {"as-is", "minor"} | ||||||
| and diff.edit_distance < _ed_floor | ||||||
| and not _is_meaningful_low_signal_change(draft, final, category or "UNKNOWN") | ||||||
| ) | ||||||
| event["low_signal_filtered"] = low_signal_filtered | ||||||
| if ( | ||||||
| not is_observation_dup | ||||||
| and not low_signal_filtered | ||||||
| and _SEV_RANK.get(diff.severity, 0) >= _SEV_RANK.get(min_severity, 0) | ||||||
| ): | ||||||
| lessons_path = brain._find_lessons_path(create=True) | ||||||
| if lessons_path: | ||||||
|
|
@@ -1014,6 +1046,18 @@ def _lesson_key(lesson): | |||||
| if all_lessons: # guard against wiping lessons file when all lessons are killed | ||||||
| write_lessons_safe(lessons_path, format_lessons(all_lessons)) | ||||||
|
|
||||||
| # Auto-export AGENTS.md by default so post-graduation rules are | ||||||
| # available to AGENTS.md-aware tools without requiring a manual CLI step. | ||||||
| auto_export_agents = os.environ.get("GRADATA_AUTO_EXPORT_AGENTS", "1").strip().lower() | ||||||
| if auto_export_agents not in {"0", "false", "off", "no"}: | ||||||
| try: | ||||||
| from gradata.enhancements.rule_export import export_rules | ||||||
|
|
||||||
| agents_text = export_rules(brain.dir, target="agents", lessons_path=lessons_path) | ||||||
| (brain.dir / "AGENTS.md").write_text(agents_text, encoding="utf-8") | ||||||
| except Exception as e: | ||||||
| _log.debug("AGENTS.md auto-export skipped: %s", e) | ||||||
|
|
||||||
| # Archive graduated RULE lessons | ||||||
| new_rules = [ | ||||||
| l | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| """Experiment knobs for Beta-LB graduation gating. | ||
|
|
||
| These settings belong to GRA-210 and intentionally keep runtime behavior | ||
| backwards-compatible by default. Production default remains 0.75, while | ||
| `GRADATA_BETA_LB_THRESHOLD` can be set to `0.55` for the staged experiment. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import math | ||
| import os | ||
|
|
||
# GRA-210: graduation_threshold experiment parameter for Beta-LB lower-bound checks.
GRA_210_EXPERIMENT = "GRA-210"
GRA_210_GRADUATION_THRESHOLD_ENV = "GRADATA_BETA_LB_THRESHOLD"
GRA_210_GRADUATION_THRESHOLD_DEFAULT = 0.75


def read_beta_lb_threshold(default: float = GRA_210_GRADUATION_THRESHOLD_DEFAULT) -> float:
    """Read the Beta-LB threshold override from the environment.

    The result is always a finite float in ``[0.0, 1.0]``.

    Args:
        default: Fallback used when ``GRADATA_BETA_LB_THRESHOLD`` is unset,
            unparsable or non-finite. The fallback itself is sanitized: a
            non-numeric or non-finite value is replaced by
            ``GRA_210_GRADUATION_THRESHOLD_DEFAULT``, and an out-of-range
            value is clipped to ``[0.0, 1.0]``.

    Returns:
        The override clipped to ``[0.0, 1.0]``, or the sanitized fallback.
    """
    # Sanitize the fallback first so every return path honours the range
    # contract, not only the env-override path.
    try:
        fallback = float(default)
    except (TypeError, ValueError):
        fallback = GRA_210_GRADUATION_THRESHOLD_DEFAULT
    if not math.isfinite(fallback):
        fallback = GRA_210_GRADUATION_THRESHOLD_DEFAULT
    fallback = min(max(fallback, 0.0), 1.0)

    raw_value = os.environ.get(GRA_210_GRADUATION_THRESHOLD_ENV)
    if raw_value is None:
        return fallback

    try:
        threshold = float(raw_value)
    except (TypeError, ValueError):
        return fallback

    # float() accepts "nan" and "inf"; neither is a usable threshold.
    if not math.isfinite(threshold):
        return fallback

    return min(max(threshold, 0.0), 1.0)
|
Comment on lines
+19
to
+37
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🧹 Nitpick | 🔵 Trivial | ⚡ Quick win
Normalize `default` as well. When env parsing fails (or env is unset), an out-of-range `default` is returned without being clipped.
Proposed patch:
 def read_beta_lb_threshold(default: float = GRA_210_GRADUATION_THRESHOLD_DEFAULT) -> float:
@@
-    raw_value = os.environ.get(GRA_210_GRADUATION_THRESHOLD_ENV)
+    if not math.isfinite(default):
+        default = GRA_210_GRADUATION_THRESHOLD_DEFAULT
+    default = min(max(default, 0.0), 1.0)
+
+    raw_value = os.environ.get(GRA_210_GRADUATION_THRESHOLD_ENV)
     if raw_value is None:
         return default
🤖 Prompt for AI Agents |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Graduation metrics are currently over-counted by non-graduation rows.
Lines 31–36 and 44–49 treat generic `outcome`/`status`/`accepted` fields as graduation outcomes without verifying the row is graduation-related, so unrelated events can skew `accepted_graduations`/`rejection_count`.
Proposed fix
Also applies to: 39-49
🤖 Prompt for AI Agents