diff --git a/.gitignore b/.gitignore index 50ead2f9..39260d06 100644 --- a/.gitignore +++ b/.gitignore @@ -175,6 +175,7 @@ Gradata/scripts/* !Gradata/scripts/publish-npm.sh !Gradata/scripts/cloud/ !Gradata/scripts/migrate_legacy_scopes.py +!Gradata/scripts/weekly_correction_snapshot.py # npm sub-package build outputs (source tracked, outputs ignored) Gradata/packages/npm/node_modules/ diff --git a/Gradata/docs/weekly-correction-snapshot.md b/Gradata/docs/weekly-correction-snapshot.md new file mode 100644 index 00000000..f9609db8 --- /dev/null +++ b/Gradata/docs/weekly-correction-snapshot.md @@ -0,0 +1,36 @@ +# Weekly Correction Snapshot + +`scripts/weekly_correction_snapshot.py` builds a deterministic JSON summary from newline-delimited JSON (NDJSON) events. It is intended for weekly correction-outcome trend reporting. + +## Usage + +From file: + +```bash +python scripts/weekly_correction_snapshot.py --input /path/to/events.jsonl +``` + +From stdin: + +```bash +cat /path/to/events.jsonl | python scripts/weekly_correction_snapshot.py +``` + +## Output schema + +The script always emits one compact JSON object with keys sorted alphabetically: + +- `total_corrections` (int): count of correction events (`event=correction.created` or `kind=correction`) +- `accepted_graduations` (int): count of rows with an accepted signal (`event=lesson.graduated` or `event=graduation.accepted`, `outcome=accepted`, `accepted=true`, or `status=accepted`/`graduated`) and no rejected signal +- `rejection_count` (int): count of rows with a rejected signal (`event=graduation.rejected` or `event=lesson.rejected`, `outcome=rejected`, `accepted=false`, or `status=rejected`) and no accepted signal; rows carrying both signals are counted as neither +- `acceptance_rate` (float): `accepted_graduations / (accepted_graduations + rejection_count)`, rounded to 6 decimals, or `0.0` if denominator is zero +- `top_rule_categories` (list): up to 5 entries sorted by descending count, then by ascending category name +- `skipped_rows` (int): malformed or non-object rows ignored during parsing (blank lines are ignored and not counted) + +`top_rule_categories` entries use: + +```json +{"category":"tone","count":12} +``` + +Categories are normalized by trimming surrounding whitespace and lowercasing. Empty/missing categories normalize to `"unknown"`. 
diff --git a/Gradata/pyproject.toml b/Gradata/pyproject.toml index 593e0b30..913097ef 100644 --- a/Gradata/pyproject.toml +++ b/Gradata/pyproject.toml @@ -165,7 +165,7 @@ skips = [ # --- Pytest --- [tool.pytest.ini_options] testpaths = ["tests"] -pythonpath = ["src"] +pythonpath = ["src", "scripts"] markers = [ "integration: tests that hit external LLM APIs (cost money, skip in CI)", "dualwrite: dual-write crash recovery and reconciliation tests", diff --git a/Gradata/scripts/weekly_correction_snapshot.py b/Gradata/scripts/weekly_correction_snapshot.py new file mode 100644 index 00000000..e4d64dea --- /dev/null +++ b/Gradata/scripts/weekly_correction_snapshot.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +"""Compute weekly correction/graduation aggregates from NDJSON events.""" + +from __future__ import annotations + +import argparse +import json +import sys +from collections import Counter +from typing import Any + + +def _normalize_category(value: Any) -> str: + if value is None: + return "unknown" + normalized = str(value).strip().lower() + return normalized or "unknown" + + +def _is_correction(row: dict[str, Any]) -> bool: + event = str(row.get("event", "")).strip().lower() + kind = str(row.get("kind", "")).strip().lower() + return event == "correction.created" or kind == "correction" + + +def _is_graduation_accepted(row: dict[str, Any]) -> bool: + event = str(row.get("event", "")).strip().lower() + outcome = str(row.get("outcome", "")).strip().lower() + accepted_flag = row.get("accepted") + status = str(row.get("status", "")).strip().lower() + return ( + event in {"lesson.graduated", "graduation.accepted"} + or outcome == "accepted" + or accepted_flag is True + or status in {"accepted", "graduated"} + ) + + +def _is_rejection(row: dict[str, Any]) -> bool: + event = str(row.get("event", "")).strip().lower() + outcome = str(row.get("outcome", "")).strip().lower() + accepted_flag = row.get("accepted") + status = str(row.get("status", "")).strip().lower() + return 
( + event in {"graduation.rejected", "lesson.rejected"} + or outcome == "rejected" + or accepted_flag is False + or status == "rejected" + ) + + +def parse_rows(lines: list[str]) -> tuple[list[dict[str, Any]], int]: + rows: list[dict[str, Any]] = [] + skipped = 0 + for raw in lines: + line = raw.strip() + if not line: + continue + try: + row = json.loads(line) + except json.JSONDecodeError: + skipped += 1 + continue + if not isinstance(row, dict): + skipped += 1 + continue + rows.append(row) + return rows, skipped + + +def aggregate(rows: list[dict[str, Any]]) -> dict[str, Any]: + total_corrections = 0 + accepted_graduations = 0 + rejection_count = 0 + categories: Counter[str] = Counter() + + for row in rows: + if _is_correction(row): + total_corrections += 1 + categories[_normalize_category(row.get("category"))] += 1 + is_accepted = _is_graduation_accepted(row) + is_rejected = _is_rejection(row) + if is_accepted and not is_rejected: + accepted_graduations += 1 + elif is_rejected and not is_accepted: + rejection_count += 1 + + denominator = accepted_graduations + rejection_count + acceptance_rate = round(accepted_graduations / denominator, 6) if denominator else 0.0 + + top_categories = [ + {"category": name, "count": count} + for name, count in sorted(categories.items(), key=lambda item: (-item[1], item[0]))[:5] + ] + + return { + "total_corrections": total_corrections, + "accepted_graduations": accepted_graduations, + "rejection_count": rejection_count, + "acceptance_rate": acceptance_rate, + "top_rule_categories": top_categories, + } + + +def _read_lines(path: str | None) -> list[str]: + if path: + with open(path, encoding="utf-8") as handle: + return handle.readlines() + return sys.stdin.readlines() + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description="Compute correction-outcome aggregates for weekly trend snapshots." 
+ ) + parser.add_argument("--input", help="Path to newline-delimited JSON input file") + args = parser.parse_args(argv) + + lines = _read_lines(args.input) + rows, skipped_rows = parse_rows(lines) + snapshot = aggregate(rows) + snapshot["skipped_rows"] = skipped_rows + + json.dump(snapshot, sys.stdout, sort_keys=True, separators=(",", ":")) + sys.stdout.write("\n") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/Gradata/src/gradata/_core.py b/Gradata/src/gradata/_core.py index 2a9323aa..baa1cf1e 100644 --- a/Gradata/src/gradata/_core.py +++ b/Gradata/src/gradata/_core.py @@ -10,6 +10,7 @@ import contextlib import logging import math +import os import re # used by export functions for slug sanitization import statistics from datetime import UTC @@ -30,6 +31,11 @@ _STATE_RANK = {"INSTINCT": 0, "PATTERN": 1, "RULE": 2} # Severity ordering for min_severity gating _SEV_RANK = {"as-is": 0, "minor": 1, "moderate": 2, "major": 3, "discarded": 4} +_LOW_SIGNAL_EDIT_DISTANCE_FLOOR = 0.04 +# FORMAT/DRAFTING synonym swaps carry minimal signal; require a larger edit +# before recording a lesson so we don't learn from synonym-level noise. +_FORMAT_DRAFTING_EDIT_DISTANCE_FLOOR = 0.07 +_FORMAT_DRAFTING_CATEGORIES = frozenset({"FORMAT", "DRAFTING"}) # Map evaluator dimension names to correction categories _DIMENSION_CATEGORY_MAP = { @@ -54,6 +60,18 @@ def _filter_lessons_by_state(lessons, min_state: str = "PATTERN"): ] +def _is_meaningful_low_signal_change(draft: str, final: str, category: str) -> bool: + """Allow known-meaningful tiny edits to pass the low-signal floor.""" + cat = (category or "UNKNOWN").upper() + if cat in {"ACCURACY", "SECURITY"}: + return True + # Proper-noun/acronym capitalization fixes can carry meaning even when + # edit distance is tiny. 
+ if draft != final and draft.lower() == final.lower(): + return bool(re.search(r"\b[A-Z]{2,}\b|\b[A-Z][a-z]{2,}\b", final)) + return False + + # ── correct() ────────────────────────────────────────────────────────── @@ -99,7 +117,7 @@ def brain_correct( agent_type: str | None = None, approval_required: bool = False, dry_run: bool = False, - min_severity: str = "as-is", + min_severity: str = "minor", scope: str | None = None, applies_to: str | None = None, auto_heal: bool = False, @@ -353,8 +371,22 @@ def brain_correct( update_confidence, ) - if not is_observation_dup and _SEV_RANK.get(diff.severity, 0) >= _SEV_RANK.get( - min_severity, 0 + _cat_upper = (category or "UNKNOWN").upper() + _ed_floor = ( + _FORMAT_DRAFTING_EDIT_DISTANCE_FLOOR + if _cat_upper in _FORMAT_DRAFTING_CATEGORIES + else _LOW_SIGNAL_EDIT_DISTANCE_FLOOR + ) + low_signal_filtered = ( + diff.severity in {"as-is", "minor"} + and diff.edit_distance < _ed_floor + and not _is_meaningful_low_signal_change(draft, final, category or "UNKNOWN") + ) + event["low_signal_filtered"] = low_signal_filtered + if ( + not is_observation_dup + and not low_signal_filtered + and _SEV_RANK.get(diff.severity, 0) >= _SEV_RANK.get(min_severity, 0) ): lessons_path = brain._find_lessons_path(create=True) if lessons_path: @@ -1014,6 +1046,18 @@ def _lesson_key(lesson): if all_lessons: # guard against wiping lessons file when all lessons are killed write_lessons_safe(lessons_path, format_lessons(all_lessons)) + # Auto-export AGENTS.md by default so post-graduation rules are + # available to AGENTS.md-aware tools without requiring a manual CLI step. 
+ auto_export_agents = os.environ.get("GRADATA_AUTO_EXPORT_AGENTS", "1").strip().lower() + if auto_export_agents not in {"0", "false", "off", "no"}: + try: + from gradata.enhancements.rule_export import export_rules + + agents_text = export_rules(brain.dir, target="agents", lessons_path=lessons_path) + (brain.dir / "AGENTS.md").write_text(agents_text, encoding="utf-8") + except Exception as e: + _log.debug("AGENTS.md auto-export skipped: %s", e) + # Archive graduated RULE lessons new_rules = [ l diff --git a/Gradata/src/gradata/enhancements/self_improvement/_graduation.py b/Gradata/src/gradata/enhancements/self_improvement/_graduation.py index 7949064f..f2a33ea0 100644 --- a/Gradata/src/gradata/enhancements/self_improvement/_graduation.py +++ b/Gradata/src/gradata/enhancements/self_improvement/_graduation.py @@ -28,6 +28,9 @@ graduation_thresholds, is_hook_enforced, ) +from gradata.enhancements.self_improvement._graduation_flags import ( + read_beta_lb_threshold, +) _log = logging.getLogger(__name__) @@ -105,7 +108,6 @@ def _read_beta_lb_config() -> tuple[bool, float, int]: Called once per ``graduate()`` invocation so per-lesson gate checks can skip repeated ``os.environ.get`` lookups inside the graduation loop. 
""" - import math import os enabled = os.environ.get("GRADATA_BETA_LB_GATE", "1").lower() not in ( @@ -115,15 +117,7 @@ def _read_beta_lb_config() -> tuple[bool, float, int]: "off", ) defaults = graduation_thresholds() - try: - threshold = float( - os.environ.get("GRADATA_BETA_LB_THRESHOLD", str(defaults.beta_lb_threshold)) - ) - if not math.isfinite(threshold): - threshold = defaults.beta_lb_threshold - threshold = min(max(threshold, 0.0), 1.0) - except (TypeError, ValueError): - threshold = defaults.beta_lb_threshold + threshold = read_beta_lb_threshold(defaults.beta_lb_threshold) try: min_fires = max( 0, diff --git a/Gradata/src/gradata/enhancements/self_improvement/_graduation_flags.py b/Gradata/src/gradata/enhancements/self_improvement/_graduation_flags.py new file mode 100644 index 00000000..27266614 --- /dev/null +++ b/Gradata/src/gradata/enhancements/self_improvement/_graduation_flags.py @@ -0,0 +1,37 @@ +"""Experiment knobs for Beta-LB graduation gating. + +These settings belong to GRA-210 and intentionally keep runtime behavior +backwards-compatible by default. Production default remains 0.75, while +`GRADATA_BETA_LB_THRESHOLD` can be set to `0.55` for the staged experiment. +""" + +from __future__ import annotations + +import math +import os + +# GRA-210: graduation_threshold experiment parameter for Beta-LB lower-bound checks. +GRA_210_EXPERIMENT = "GRA-210" +GRA_210_GRADUATION_THRESHOLD_ENV = "GRADATA_BETA_LB_THRESHOLD" +GRA_210_GRADUATION_THRESHOLD_DEFAULT = 0.75 + + +def read_beta_lb_threshold(default: float = GRA_210_GRADUATION_THRESHOLD_DEFAULT) -> float: + """Read the Beta-LB threshold override from env. + + Returns a float clipped to [0.0, 1.0], or ``default`` when parsing fails. 
+ """ + + raw_value = os.environ.get(GRA_210_GRADUATION_THRESHOLD_ENV) + if raw_value is None: + return default + + try: + threshold = float(raw_value) + except (TypeError, ValueError): + return default + + if not math.isfinite(threshold): + return default + + return min(max(threshold, 0.0), 1.0) diff --git a/Gradata/tests/test_byo_key_provider.py b/Gradata/tests/test_byo_key_provider.py index 441d1c04..69b85f3b 100644 --- a/Gradata/tests/test_byo_key_provider.py +++ b/Gradata/tests/test_byo_key_provider.py @@ -1,7 +1,11 @@ from __future__ import annotations +import pytest + from gradata.llm.byo_key import BYOKeyProvider +pytest.importorskip("httpx") + class _Response: def __init__(self, payload: dict): diff --git a/Gradata/tests/test_dedup.py b/Gradata/tests/test_dedup.py index 71114412..fd9ca164 100644 --- a/Gradata/tests/test_dedup.py +++ b/Gradata/tests/test_dedup.py @@ -209,5 +209,170 @@ def test_brain_correct_annotates_fingerprint_and_seen_count(fresh_brain): assert result.get("observation_seen_count") == 1 +def test_brain_correct_semantic_near_duplicate_is_deduped(fresh_brain): + brain = fresh_brain + a1 = "We should probably maybe include the exact API endpoint in docs." + b1 = "Include the exact API endpoint in docs." + a2 = "We should maybe include the exact API endpoint in docs." + b2 = "Please include the exact API endpoint in docs." + + first = brain.correct(a1, b1, category="DRAFTING", session=2) + assert first.get("observation_deduped") is not True + + second = brain.correct(a2, b2, category="DRAFTING", session=2) + assert second.get("observation_deduped") is True + assert second.get("observation_dedup_reason") == "semantic" + + +def test_low_signal_floor_filters_tiny_non_meaningful_corrections(fresh_brain): + brain = fresh_brain + # Tiny punctuation-only edit; should be dropped by low-signal floor. + draft = "This sentence has enough tokens that a single punctuation mark is low-signal" + final = f"{draft}." 
+ result = brain.correct(draft, final, category="FORMAT", session=3) + assert result.get("low_signal_filtered") is True + assert result.get("lessons_created", 0) == 0 + + +def test_dedup_does_not_inflate_lineage_correction_ids(fresh_brain): + brain = fresh_brain + draft = "we can maybe maybe probably ship this friday" + final = "We can ship this Friday." + + brain.correct(draft, final, category="DRAFTING", session=4) + for _ in range(5): + brain.correct(draft, final, category="DRAFTING", session=4) + + lessons = [l for l in brain._load_lessons() if l.category == "DRAFTING"] + assert lessons, "Expected at least one DRAFTING lesson" + lineages = [len(l.correction_event_ids) for l in lessons] + # Duplicate observations must not add new lineage IDs. + assert max(lineages) == 1 + + +# ── Cycle 3: category-aware noise filtering ────────────────────────────────── + + +def test_category_aware_floor_constants(): + """The FORMAT/DRAFTING floor (0.07) must be higher than the base floor (0.04).""" + from gradata._core import ( + _FORMAT_DRAFTING_CATEGORIES, + _FORMAT_DRAFTING_EDIT_DISTANCE_FLOOR, + _LOW_SIGNAL_EDIT_DISTANCE_FLOOR, + ) + + assert _FORMAT_DRAFTING_EDIT_DISTANCE_FLOOR == 0.07 + assert _FORMAT_DRAFTING_EDIT_DISTANCE_FLOOR > _LOW_SIGNAL_EDIT_DISTANCE_FLOOR + assert "FORMAT" in _FORMAT_DRAFTING_CATEGORIES + assert "DRAFTING" in _FORMAT_DRAFTING_CATEGORIES + assert "SECURITY" not in _FORMAT_DRAFTING_CATEGORIES + assert "ACCURACY" not in _FORMAT_DRAFTING_CATEGORIES + + +def test_format_drafting_floor_filters_medium_ed_corrections(fresh_brain): + """FORMAT/DRAFTING corrections with 0.04 ≤ ed < 0.07 are filtered as noise. + + Cycle-3 hypothesis: raise the low-signal floor to 0.07 for FORMAT/DRAFTING + so synonym-swap edits that slip past the old 0.04 floor are blocked. + We inject a controlled DiffResult (ed=0.05, severity='minor') so the test + is deterministic regardless of diff-engine tuning. 
+ """ + from unittest.mock import patch + + from gradata.enhancements.diff_engine import DiffResult + + synthetic_diff = DiffResult( + edit_distance=0.05, + compression_distance=0.05, + changed_sections=[], + severity="minor", + summary_stats={}, + ) + + with patch("gradata.enhancements.diff_engine.compute_diff", return_value=synthetic_diff): + result = fresh_brain.correct( + "We should utilize the existing approach here.", + "We should use the existing approach here.", + category="DRAFTING", + session=10, + ) + + assert result.get("low_signal_filtered") is True, ( + "Expected DRAFTING correction with ed=0.05 (< 0.07 floor) to be filtered as noise" + ) + assert result.get("lessons_created", 0) == 0 + + +def test_format_floor_does_not_filter_above_threshold(fresh_brain): + """FORMAT/DRAFTING corrections with ed ≥ 0.07 must NOT be filtered.""" + from unittest.mock import patch + + from gradata.enhancements.diff_engine import DiffResult + + synthetic_diff = DiffResult( + edit_distance=0.08, + compression_distance=0.08, + changed_sections=[], + severity="minor", + summary_stats={}, + ) + + with patch("gradata.enhancements.diff_engine.compute_diff", return_value=synthetic_diff): + result = fresh_brain.correct( + "We should utilize this approach.", + "We should use this approach.", + category="FORMAT", + session=12, + ) + + assert result.get("low_signal_filtered") is not True, ( + "FORMAT correction with ed=0.08 (≥ 0.07 floor) must not be filtered" + ) + + +def test_format_drafting_floor_passes_larger_ed_corrections(fresh_brain): + """FORMAT/DRAFTING corrections with ed ≥ 0.07 must NOT be filtered.""" + brain = fresh_brain + # Multi-word restructure in DRAFTING; edit distance should exceed 0.07. + draft = "Maybe we could perhaps consider thinking about simplifying this." + final = "Simplify this." 
+ result = brain.correct(draft, final, category="DRAFTING", session=11) + assert result.get("low_signal_filtered") is not True, ( + f"Expected substantial DRAFTING edit to pass; ed={result.get('edit_distance')}" + ) + + +def test_security_accuracy_always_pass_low_ed(fresh_brain): + """SECURITY/ACCURACY corrections pass regardless of severity or edit distance.""" + brain = fresh_brain + # Tiny but semantically critical: "not" inserted changes meaning entirely. + draft = "Users are allowed to delete other users' accounts." + final = "Users are not allowed to delete other users' accounts." + for cat in ("SECURITY", "ACCURACY"): + result = brain.correct(draft, final, category=cat, session=20) + assert result.get("low_signal_filtered") is not True, ( + f"{cat} correction was incorrectly filtered as low-signal" + ) + + +def test_non_format_drafting_keeps_original_floor(fresh_brain): + """Categories outside FORMAT/DRAFTING still use the 0.04 floor, not 0.07.""" + brain = fresh_brain + # TONE edit above 0.04 but below 0.07 — should NOT be filtered. + draft = ( + "We are unable to process your request at this moment in time and " + "we apologize for the inconvenience this has caused you today." + ) + final = ( + "We cannot process your request right now and we are sorry for the " + "inconvenience this has caused you today." 
+ ) + result = brain.correct(draft, final, category="TONE", session=21) + # ed is likely above 0.04 for this rewrite; should pass as signal + assert result.get("low_signal_filtered") is not True, ( + f"TONE correction above 0.04 floor should not be filtered; ed={result.get('edit_distance')}" + ) + + if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/Gradata/tests/test_weekly_correction_snapshot.py b/Gradata/tests/test_weekly_correction_snapshot.py new file mode 100644 index 00000000..484fcd1c --- /dev/null +++ b/Gradata/tests/test_weekly_correction_snapshot.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import json + +from scripts import weekly_correction_snapshot as snapshot + + +def test_parse_rows_skips_malformed_and_non_object_rows(): + rows, skipped = snapshot.parse_rows( + [ + '{"event":"correction.created","category":"tone"}', + "not-json", + '["array-row"]', + "", + " ", + ] + ) + assert skipped == 2 + assert len(rows) == 1 + + +def test_aggregate_empty_input_has_zero_division_safe_defaults(): + data = snapshot.aggregate([]) + assert data["total_corrections"] == 0 + assert data["accepted_graduations"] == 0 + assert data["rejection_count"] == 0 + assert data["acceptance_rate"] == 0.0 + assert data["top_rule_categories"] == [] + + +def test_aggregate_counts_and_top_categories_deterministically(): + rows = [ + {"event": "correction.created", "category": "Tone"}, + {"event": "correction.created", "category": "tone"}, + {"event": "correction.created", "category": "factual"}, + {"event": "correction.created", "category": " PROCESS "}, + {"kind": "correction", "category": ""}, + {"event": "lesson.graduated"}, + {"event": "graduation.accepted"}, + {"outcome": "accepted"}, + {"event": "graduation.rejected"}, + {"accepted": False}, + ] + data = snapshot.aggregate(rows) + assert data["total_corrections"] == 5 + assert data["accepted_graduations"] == 3 + assert data["rejection_count"] == 2 + assert data["acceptance_rate"] == 0.6 + assert 
data["top_rule_categories"] == [ + {"category": "tone", "count": 2}, + {"category": "factual", "count": 1}, + {"category": "process", "count": 1}, + {"category": "unknown", "count": 1}, + ] + + +def test_main_emits_deterministic_json_with_skipped_rows(capsys, monkeypatch): + payload = ( + '{"event":"correction.created","category":"tone"}\n' + '{"event":"lesson.graduated"}\n' + '{"event":"graduation.rejected"}\n' + "bad-row\n" + ) + monkeypatch.setattr("sys.stdin.readlines", lambda: payload.splitlines(keepends=True)) + rc = snapshot.main([]) + assert rc == 0 + out = capsys.readouterr().out + result = json.loads(out) + assert result == { + "acceptance_rate": 0.5, + "accepted_graduations": 1, + "rejection_count": 1, + "skipped_rows": 1, + "top_rule_categories": [{"category": "tone", "count": 1}], + "total_corrections": 1, + } + + +def test_aggregate_treats_rows_as_single_outcome(): + rows = [ + {"event": "graduation.accepted", "outcome": "rejected", "accepted": True}, + {"event": "graduation.rejected", "outcome": "accepted", "accepted": False}, + {"accepted": True, "status": "rejected"}, + {"accepted": False, "status": "accepted"}, + ] + data = snapshot.aggregate(rows) + assert data["accepted_graduations"] == 0 + assert data["rejection_count"] == 0