From 8c365894b8590398669e2481bb616c9c615cc8a3 Mon Sep 17 00:00:00 2001 From: Chris Krough <461869+ckrough@users.noreply.github.com> Date: Fri, 29 May 2026 07:43:33 -0400 Subject: [PATCH] fix: reject partial-zero classification dates (prof-5qy) Classification dates are either the "00000000" no-date sentinel or a real YYYYMMDD calendar date with ASCII digits, year >= 1, month 01-12, and a day valid for that month with leap years honored. Partial-zero components ("20240900" day 00, "20240015" month 00, "00000901" year 0000), impossible dates (e.g. "20240230"), and non-ASCII digit characters collapse to the sentinel. drover.dates exposes is_valid_classification_date() and normalize_classification_date(). The model boundary (RawClassification.date, ClassificationResult.date) carries a non-raising mode="before" field_validator that normalizes the LLM-supplied date before any downstream consumer (naming policy, tag actions, eval comparison, JSON export, on-disk reloads) reads it. NARA naming delegates to the shared normalizer. GroundTruthEntry.date raises on bad values; _load_ground_truth catches the resulting ValidationError and logs the offending line. The synthetic-sample generator validates against the same rule. The classification prompt instructs the model to emit the sentinel rather than zero-fill a single component. 
--- CLAUDE.md | 2 + README.md | 2 +- scripts/generate_eval_samples.py | 12 ++--- src/drover/dates.py | 70 ++++++++++++++++++++++++++ src/drover/evaluation.py | 25 +++++++++- src/drover/models.py | 30 ++++++++++- src/drover/naming/nara.py | 27 ++++------ src/drover/prompts/classification.md | 4 +- tests/test_dates.py | 75 ++++++++++++++++++++++++++++ tests/test_evaluation.py | 60 ++++++++++++++++++++++ tests/test_models.py | 51 +++++++++++++++++++ tests/test_naming.py | 27 ++++++++++ 12 files changed, 358 insertions(+), 27 deletions(-) create mode 100644 src/drover/dates.py create mode 100644 tests/test_dates.py create mode 100644 tests/test_evaluation.py diff --git a/CLAUDE.md b/CLAUDE.md index 33b8d3f..aeb0e8c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -140,6 +140,8 @@ uv run bandit -r src/ -f json --severity-level medium --confidence-level medium - `ClassificationResult` → Final output with suggested_path - `ClassificationErrorResult` → Error response with error_code +The `date` field on `RawClassification` and `ClassificationResult` carries a `mode="before"` normalizing validator that routes through `drover.dates.normalize_classification_date`. The model boundary always returns either a real `YYYYMMDD` calendar date (ASCII digits, leap-aware) or the `"00000000"` no-date sentinel; partial-zero components, impossible days, and non-ASCII digit characters collapse to the sentinel. Ground-truth entries (`evaluation.GroundTruthEntry.date`) reject the same set with a raising validator instead, so a bad authored entry fails validation at load; `_load_ground_truth` logs and skips that line rather than aborting the whole load. 
+ ### Configuration (`config.py`) Precedence: CLI options > config file > environment (DROVER_*) > defaults diff --git a/README.md b/README.md index 602049f..7d0b6c1 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Drover uses LLMs to analyze documents and suggest consistent, policy-compliant f - **Intelligent Classification** — Categorizes documents by domain, category, and document type - **Smart Sampling** — Adaptive page sampling for efficient processing of large documents - **Taxonomy System** — Extensible controlled vocabularies with strict or fallback modes -- **NARA-Compliant Naming** — Generates standardized filenames: `{doctype}-{vendor}-{subject}-{entity}-{date}.pdf`. The `entity` slot (pet, patient, performer, brand) is optional and is dropped when empty, when it would duplicate the vendor, or for privacy-sensitive domains. +- **NARA-Compliant Naming** — Generates standardized filenames: `{doctype}-{vendor}-{subject}-{entity}-{date}.pdf`. The `entity` slot (pet, patient, performer, brand) is optional and is dropped when empty, when it would duplicate the vendor, or for privacy-sensitive domains. The `date` slot is a real YYYYMMDD calendar date or the `00000000` no-date sentinel; partial-zero or impossible dates from the model collapse to the sentinel rather than entering the filename. 
- **macOS Tagging** — Apply classification as native filesystem tags - **Batch Processing** — Classify multiple documents with JSONL output - **Evaluation Framework** — Measure accuracy against ground truth datasets diff --git a/scripts/generate_eval_samples.py b/scripts/generate_eval_samples.py index 0fa2b47..1407839 100644 --- a/scripts/generate_eval_samples.py +++ b/scripts/generate_eval_samples.py @@ -31,6 +31,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "src")) from drover.config import LogLevel +from drover.dates import is_valid_classification_date from drover.logging import configure_logging, get_logger from drover.taxonomy.loader import get_taxonomy @@ -220,12 +221,11 @@ class GroundTruthRow(BaseModel): @field_validator("date") @classmethod def _validate_date(cls, v: str) -> str: - if v == "00000000": - return v - if not re.fullmatch(r"\d{8}", v): - raise ValueError(f"date must be YYYYMMDD or '00000000', got {v!r}") - if "0000" in (v[:4], v[4:6], v[6:8]): - raise ValueError(f"partial-zero dates are forbidden, got {v!r}") + if not is_valid_classification_date(v): + raise ValueError( + "date must be the '00000000' sentinel or a real YYYYMMDD " + f"date, got {v!r}" + ) return v diff --git a/src/drover/dates.py b/src/drover/dates.py new file mode 100644 index 0000000..4445e47 --- /dev/null +++ b/src/drover/dates.py @@ -0,0 +1,70 @@ +"""Validation and normalization for classification date strings. + +A classification date is either the ``00000000`` "no date" sentinel or an +eight-digit ASCII ``YYYYMMDD`` string naming a real calendar day. Partial-zero +dates (``20240900`` day 00, ``20240015`` month 00, ``00000901`` year 0000), +impossible days (``20240230``), and non-ASCII digit shapes (fullwidth, +Arabic-Indic) indicate a hallucinated or sloppily extracted date and are +not accepted outside the sentinel. 
+""" + +from datetime import date + +NO_DATE_SENTINEL = "00000000" + +_ASCII_DIGITS = frozenset("0123456789") + + +def is_valid_classification_date(value: str) -> bool: + """Return whether ``value`` is the no-date sentinel or a real YYYYMMDD date. + + Valid inputs are exactly ``"00000000"`` (no date available) or an + eight-character ASCII-digit ``YYYYMMDD`` string that names a real + calendar day: year >= 1, month 01-12, and a day valid for that month + with leap years honored. Anything else - partial-zero components, + impossible days like February 30, non-ASCII digit characters, wrong + lengths, or non-numeric strings - is rejected. + + Args: + value: The date string to validate. + + Returns: + True if the string is the sentinel or a real calendar date. + """ + if value == NO_DATE_SENTINEL: + return True + if len(value) != 8 or not all(c in _ASCII_DIGITS for c in value): + return False + try: + date(int(value[:4]), int(value[4:6]), int(value[6:8])) + except ValueError: + return False + return True + + +def normalize_classification_date(raw: str | None) -> str: + """Normalize a raw date string to a canonical YYYYMMDD or the sentinel. + + Strips non-ASCII-digit characters, applies the 6-digit ``YYMMDD`` to + ``20YYMMDD`` expansion, truncates inputs of 8 or more digits to the + leading eight, and validates the result with + :func:`is_valid_classification_date`. Anything that fails validation + (including ``None`` and the empty string) collapses to the + :data:`NO_DATE_SENTINEL`, so callers can treat the return as always + safe to embed in a filename, tag, or downstream record. + + Args: + raw: An LLM- or operator-supplied date string, or ``None``. + + Returns: + A real YYYYMMDD date or :data:`NO_DATE_SENTINEL`. 
+ """ + if raw is None: + return NO_DATE_SENTINEL + digits = "".join(c for c in raw if c in _ASCII_DIGITS) + if len(digits) == 6: + digits = f"20{digits}" + candidate = digits[:8] + if is_valid_classification_date(candidate): + return candidate + return NO_DATE_SENTINEL diff --git a/src/drover/evaluation.py b/src/drover/evaluation.py index 27ce8c9..525bcfd 100644 --- a/src/drover/evaluation.py +++ b/src/drover/evaluation.py @@ -14,8 +14,9 @@ from pathlib import Path from typing import TYPE_CHECKING, Any -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator +from drover.dates import is_valid_classification_date from drover.logging import get_logger if TYPE_CHECKING: @@ -41,6 +42,28 @@ class GroundTruthEntry(BaseModel): entity: str | None = Field(default=None, description="Expected entity (optional)") notes: str | None = Field(default=None, description="Notes about this entry") + @field_validator("date") + @classmethod + def _validate_date(cls, value: str | None) -> str | None: + """Reject partial-zero or impossible ground-truth dates. + + A ground-truth date must be the "00000000" sentinel or a real + YYYYMMDD calendar date. A partial-zero date (e.g. "20240900") in + authored ground truth is a data-entry bug. The validator raises + on bad values; ``_load_ground_truth`` catches the resulting + ``ValidationError`` and logs the offending line so a single bad + entry does not abort the whole load. 
+ """ + if value is None: + return value + if not is_valid_classification_date(value): + msg = ( + "date must be the '00000000' sentinel or a real YYYYMMDD " + f"date, got {value!r}" + ) + raise ValueError(msg) + return value + @dataclass class ClassificationComparison: diff --git a/src/drover/models.py b/src/drover/models.py index 316c8a2..7d6fc60 100644 --- a/src/drover/models.py +++ b/src/drover/models.py @@ -3,7 +3,9 @@ from enum import StrEnum from typing import TYPE_CHECKING, Any -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator + +from drover.dates import normalize_classification_date if TYPE_CHECKING: from pathlib import Path @@ -52,6 +54,18 @@ class ClassificationResult(BaseModel): default=None, description="AI metrics for this classification, if collected" ) + @field_validator("date", mode="before") + @classmethod + def _normalize_date(cls, value: Any) -> str: + """Normalize the date at the model boundary, never raise. + + Routes through :func:`drover.dates.normalize_classification_date` + so every downstream consumer (filename, tag actions, JSON export, + eval comparison) sees a real YYYYMMDD or the no-date sentinel, + never a partial-zero or non-ASCII-digit string. + """ + return normalize_classification_date(value if isinstance(value, str) else None) + class ClassificationErrorResult(BaseModel): """Error result when classification fails.""" @@ -93,6 +107,20 @@ class RawClassification(BaseModel): ), ) + @field_validator("date", mode="before") + @classmethod + def _normalize_date(cls, value: Any) -> str: + """Normalize the LLM-supplied date at the model boundary, never raise. + + Routes through :func:`drover.dates.normalize_classification_date` + so a hallucinated partial-zero or non-ASCII-digit date is coerced + to the no-date sentinel before any downstream consumer (naming + policy, tag action, eval comparison, JSON export) reads it. 
The + validator never raises, which keeps it compatible with LangChain's + ``with_structured_output`` retry loop. + """ + return normalize_classification_date(value if isinstance(value, str) else None) + class PathConstraints(BaseModel): """Constraints for generated file paths.""" diff --git a/src/drover/naming/nara.py b/src/drover/naming/nara.py index 26c0aba..584582e 100644 --- a/src/drover/naming/nara.py +++ b/src/drover/naming/nara.py @@ -11,6 +11,7 @@ from typing import ClassVar +from drover.dates import normalize_classification_date from drover.naming.base import BaseNamingPolicy, NamingConstraints @@ -120,26 +121,18 @@ def format_filename( return filename - def _normalize_date(self, date: str) -> str: - """Normalize date to YYYYMMDD format. + def _normalize_date(self, date: str | None) -> str: + """Normalize a date string to YYYYMMDD format. - Accepts various formats and normalizes to 8-digit date. + Delegates to :func:`drover.dates.normalize_classification_date`, + which strips non-ASCII-digit characters, expands 6-digit + ``YYMMDD`` inputs, and returns the ``"00000000"`` sentinel for + any partial-zero, impossible, or otherwise invalid date. Args: - date: Date string in various formats. + date: Date string in various formats, or ``None``. Returns: - Date in YYYYMMDD format, or "00000000" if unparseable. + A real YYYYMMDD date, or ``"00000000"`` if unparseable or invalid. 
""" - digits = "".join(c for c in date if c.isdigit()) - - if len(digits) == 8: - return digits - - if len(digits) == 6: - return f"20{digits}" - - if len(digits) >= 8: - return digits[:8] - - return "00000000" + return normalize_classification_date(date) diff --git a/src/drover/prompts/classification.md b/src/drover/prompts/classification.md index 2a278b4..1c2a8f4 100644 --- a/src/drover/prompts/classification.md +++ b/src/drover/prompts/classification.md @@ -46,6 +46,8 @@ Select the highest priority date found in the document: **If no date exists:** Use "00000000" +**Never zero-fill a single component:** If the year, month, or day is unknown, use "00000000" for the entire date. Do not emit a partial-zero date such as "20240900" (unknown day) or "20240015" (unknown month) - these are invalid. + ### Rule 2: Vendor Identification - Use the **full organization name** (e.g., "Northern Virginia Medical Center" not "NVMC") @@ -159,7 +161,7 @@ If entity would equal the vendor, return "". Work through these steps internally before producing the structured output. Do NOT emit this analysis as text; the response is schema-constrained and must contain only the seven fields. 1. **Extract evidence (cap your scan):** Note up to 5 organizations and up to 5 dates with their context. Prioritize letterhead, signature blocks, and the first and last pages over middle-of-document mentions. Note the document's structural form and the specific goods, services, or activities it covers. -2. **Pick the date** by priority (transaction/service > statement/issue > due) and convert to YYYYMMDD. Use "00000000" if no date is available. +2. **Pick the date** by priority (transaction/service > statement/issue > due) and convert to YYYYMMDD. Use "00000000" if no date is available or if any single component (year, month, or day) is unknown - never zero-fill one component. 3. **Pick the vendor** as the full issuing organization name, or "unknown". 4. 
**Draft the subject** as 2-4 lowercase words describing content (not document form). 5. **Pick the domain** by fundamental purpose. If "financial" is a candidate, explicitly check whether the financial aspect is merely transactional over a functional domain (medical, pets, property, vehicles, insurance, etc.). diff --git a/tests/test_dates.py b/tests/test_dates.py new file mode 100644 index 0000000..b093637 --- /dev/null +++ b/tests/test_dates.py @@ -0,0 +1,75 @@ +"""Tests for classification date validation and normalization.""" + +import pytest + +from drover.dates import ( + NO_DATE_SENTINEL, + is_valid_classification_date, + normalize_classification_date, +) + +# Confusable-digit strings so the test exercises exactly the code points +# a hallucinating LLM could emit. Both encode "20240115" using +# non-ASCII digit code points and must be rejected by the validator. +FULLWIDTH_DIGITS_DATE = "\uff12\uff10\uff12\uff14\uff10\uff11\uff11\uff15" +ARABIC_INDIC_DIGITS_DATE = "٢٠٢٤٠١١٥" + + +class TestIsValidClassificationDate: + """Truth table for the shared classification-date validator.""" + + @pytest.mark.parametrize( + "value", + [ + "20240115", # ordinary real date + "20240229", # leap day in a leap year + NO_DATE_SENTINEL, # the no-date sentinel + ], + ) + def test_accepts_real_dates_and_sentinel(self, value: str) -> None: + assert is_valid_classification_date(value) is True + + @pytest.mark.parametrize( + "value", + [ + "20240900", # day == 00 + "20240015", # month == 00 + "00000901", # year == 0000 (month/day valid) + "20240230", # February 30 never exists + "20230229", # not a leap year + "20241301", # month 13 + "2024011", # too short + "202401155", # too long + "2024-01-15", # not eight digits + "abcdefgh", # non-numeric + "", # empty + FULLWIDTH_DIGITS_DATE, + ARABIC_INDIC_DIGITS_DATE, + ], + ) + def test_rejects_partial_zero_and_impossible_dates(self, value: str) -> None: + assert is_valid_classification_date(value) is False + + +class TestNormalizeClassificationDate: + 
"""normalize_classification_date returns a real date or the sentinel.""" + + @pytest.mark.parametrize( + ("raw", "expected"), + [ + ("20240115", "20240115"), # passes through + ("240115", "20240115"), # 6-digit YYMMDD expansion + (NO_DATE_SENTINEL, NO_DATE_SENTINEL), # sentinel preserved + ("2024-01-15", "20240115"), # strip separators + ("2024011500", "20240115"), # >8 digits truncated to leading 8 + ("20240900", NO_DATE_SENTINEL), # day 00 + ("20240015", NO_DATE_SENTINEL), # month 00 + ("00000901", NO_DATE_SENTINEL), # year 0000 + ("20240230", NO_DATE_SENTINEL), # impossible day + ("", NO_DATE_SENTINEL), # empty + (None, NO_DATE_SENTINEL), # None never crashes + (FULLWIDTH_DIGITS_DATE, NO_DATE_SENTINEL), # confusable digits rejected + ], + ) + def test_normalize(self, raw: str | None, expected: str) -> None: + assert normalize_classification_date(raw) == expected diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py new file mode 100644 index 0000000..0f6bd84 --- /dev/null +++ b/tests/test_evaluation.py @@ -0,0 +1,60 @@ +"""Tests for the evaluation framework's data models.""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +import pytest +from pydantic import ValidationError + +from drover.dates import NO_DATE_SENTINEL +from drover.evaluation import ClassificationEvaluator, GroundTruthEntry + +if TYPE_CHECKING: + from pathlib import Path + + +class TestGroundTruthEntryDateValidation: + """GroundTruthEntry rejects partial-zero and impossible dates.""" + + def _entry(self, date: str | None) -> GroundTruthEntry: + return GroundTruthEntry.model_validate( + { + "filename": "doc.pdf", + "domain": "financial", + "category": "banking", + "doctype": "statement", + "date": date, + } + ) + + @pytest.mark.parametrize("date", ["20240115", NO_DATE_SENTINEL, None]) + def test_accepts_real_date_sentinel_and_missing(self, date: str | None) -> None: + assert self._entry(date).date == date + + @pytest.mark.parametrize("date", 
["20240900", "20240015", "00000901"]) + def test_rejects_partial_zero_dates(self, date: str) -> None: + with pytest.raises(ValidationError): + self._entry(date) + + +class TestGroundTruthLoaderSkipsBadDates: + """A partial-zero ground-truth line is skipped, not fatal to the load.""" + + def test_bad_date_line_is_skipped_not_fatal(self, tmp_path: Path) -> None: + good = { + "filename": "good.pdf", + "domain": "financial", + "category": "banking", + "doctype": "statement", + "date": "20240115", + } + bad = {**good, "filename": "bad.pdf", "date": "20240900"} + gt = tmp_path / "ground_truth.jsonl" + gt.write_text(json.dumps(good) + "\n" + json.dumps(bad) + "\n") + + evaluator = ClassificationEvaluator(ground_truth_path=str(gt)) + + assert "good.pdf" in evaluator.ground_truth + assert "bad.pdf" not in evaluator.ground_truth diff --git a/tests/test_models.py b/tests/test_models.py index 1749baa..10452df 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,5 +1,8 @@ """Tests for Pydantic models.""" +import pytest + +from drover.dates import NO_DATE_SENTINEL from drover.models import ( ClassificationErrorResult, ClassificationResult, @@ -7,6 +10,10 @@ RawClassification, ) +# Confusable-digit string encoding "20240115" with fullwidth code points, +# used to verify the model-boundary normalizer rejects non-ASCII digits. +_FULLWIDTH_DIGITS_DATE = "\uff12\uff10\uff12\uff14\uff10\uff11\uff11\uff15" + def test_classification_result_success(): """Test creating a successful classification result. 
@@ -95,3 +102,47 @@ def test_classification_result_entity_defaults_to_empty(): subject="checking", ) assert result.entity == "" + + +@pytest.mark.parametrize( + ("raw_date", "expected"), + [ + ("20240115", "20240115"), # real date preserved + (NO_DATE_SENTINEL, NO_DATE_SENTINEL), # sentinel preserved + ("20240900", NO_DATE_SENTINEL), # day 00 normalized + ("20240015", NO_DATE_SENTINEL), # month 00 normalized + ("00000901", NO_DATE_SENTINEL), # year 0000 normalized + ("20240230", NO_DATE_SENTINEL), # impossible day normalized + (_FULLWIDTH_DIGITS_DATE, NO_DATE_SENTINEL), # confusable digits normalized + ("240115", "20240115"), # 6-digit YYMMDD expanded + ], +) +def test_raw_classification_normalizes_date_at_boundary( + raw_date: str, expected: str +) -> None: + """The LLM-supplied date is normalized before any downstream consumer reads it.""" + raw = RawClassification( + domain="financial", + category="banking", + doctype="statement", + vendor="chase", + date=raw_date, + subject="checking", + ) + assert raw.date == expected + + +def test_classification_result_normalizes_date_at_boundary() -> None: + """ClassificationResult also normalizes its date for safe downstream reuse.""" + result = ClassificationResult( + original="doc.pdf", + suggested_path="financial/banking/statement/doc.pdf", + suggested_filename="doc.pdf", + domain="financial", + category="banking", + doctype="statement", + vendor="chase", + date="20240900", # day 00 from the LLM + subject="checking", + ) + assert result.date == NO_DATE_SENTINEL diff --git a/tests/test_naming.py b/tests/test_naming.py index 0c48dbd..9a486a7 100644 --- a/tests/test_naming.py +++ b/tests/test_naming.py @@ -2,6 +2,7 @@ import pytest +from drover.dates import NO_DATE_SENTINEL from drover.naming import ( NARAPolicyNaming, get_naming_loader, @@ -86,6 +87,32 @@ def test_format_filename_date_normalization(self, policy: NARAPolicyNaming) -> N ) assert "20240115" in filename + @pytest.mark.parametrize("bad_date", ["20240900", 
"20240015", "00000901"]) + def test_format_filename_partial_zero_date_becomes_sentinel( + self, policy: NARAPolicyNaming, bad_date: str + ) -> None: + """Partial-zero dates normalize to the no-date sentinel, not the filename.""" + filename = policy.format_filename( + doctype="statement", + vendor="test", + subject="test", + date=bad_date, + extension=".pdf", + ) + assert bad_date not in filename + assert NO_DATE_SENTINEL in filename + + def test_normalize_date_delegates_to_shared_normalizer( + self, policy: NARAPolicyNaming + ) -> None: + """_normalize_date routes invalid dates to the sentinel and preserves real dates.""" + assert policy._normalize_date("20240900") == NO_DATE_SENTINEL + assert policy._normalize_date("20240230") == NO_DATE_SENTINEL + assert policy._normalize_date(None) == NO_DATE_SENTINEL + assert policy._normalize_date("20240115") == "20240115" + assert policy._normalize_date("240115") == "20240115" + assert policy._normalize_date(NO_DATE_SENTINEL) == NO_DATE_SENTINEL + def test_format_filename_missing_extension_dot( self, policy: NARAPolicyNaming ) -> None: