Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ uv run bandit -r src/ -f json --severity-level medium --confidence-level medium
- `ClassificationResult` → Final output with suggested_path
- `ClassificationErrorResult` → Error response with error_code

The `date` field on `RawClassification` and `ClassificationResult` carries a `mode="before"` normalizing validator that routes through `drover.dates.normalize_classification_date`. The model boundary always returns either a real `YYYYMMDD` calendar date (ASCII digits, leap-aware) or the `"00000000"` no-date sentinel; partial-zero components, impossible days, and non-ASCII digit characters collapse to the sentinel. Ground-truth entries (`evaluation.GroundTruthEntry.date`) reject the same set with a raising validator instead, so a bad authored entry is rejected (and logged) at load time rather than being silently normalized; a single bad entry does not abort the whole load.

### Configuration (`config.py`)
Precedence: CLI options > config file > environment (DROVER_*) > defaults

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Drover uses LLMs to analyze documents and suggest consistent, policy-compliant f
- **Intelligent Classification** — Categorizes documents by domain, category, and document type
- **Smart Sampling** — Adaptive page sampling for efficient processing of large documents
- **Taxonomy System** — Extensible controlled vocabularies with strict or fallback modes
- **NARA-Compliant Naming** — Generates standardized filenames: `{doctype}-{vendor}-{subject}-{entity}-{date}.pdf`. The `entity` slot (pet, patient, performer, brand) is optional and is dropped when empty, when it would duplicate the vendor, or for privacy-sensitive domains.
- **NARA-Compliant Naming** — Generates standardized filenames: `{doctype}-{vendor}-{subject}-{entity}-{date}.pdf`. The `entity` slot (pet, patient, performer, brand) is optional and is dropped when empty, when it would duplicate the vendor, or for privacy-sensitive domains. The `date` slot is a real YYYYMMDD calendar date or the `00000000` no-date sentinel; partial-zero or impossible dates from the model collapse to the sentinel rather than entering the filename.
- **macOS Tagging** — Apply classification as native filesystem tags
- **Batch Processing** — Classify multiple documents with JSONL output
- **Evaluation Framework** — Measure accuracy against ground truth datasets
Expand Down
12 changes: 6 additions & 6 deletions scripts/generate_eval_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from drover.config import LogLevel
from drover.dates import is_valid_classification_date
from drover.logging import configure_logging, get_logger
from drover.taxonomy.loader import get_taxonomy

Expand Down Expand Up @@ -220,12 +221,11 @@ class GroundTruthRow(BaseModel):
@field_validator("date")
@classmethod
def _validate_date(cls, v: str) -> str:
if v == "00000000":
return v
if not re.fullmatch(r"\d{8}", v):
raise ValueError(f"date must be YYYYMMDD or '00000000', got {v!r}")
if "0000" in (v[:4], v[4:6], v[6:8]):
raise ValueError(f"partial-zero dates are forbidden, got {v!r}")
if not is_valid_classification_date(v):
raise ValueError(
"date must be the '00000000' sentinel or a real YYYYMMDD "
f"date, got {v!r}"
)
return v


Expand Down
70 changes: 70 additions & 0 deletions src/drover/dates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Validation and normalization for classification date strings.

A classification date is either the ``00000000`` "no date" sentinel or an
eight-digit ASCII ``YYYYMMDD`` string naming a real calendar day. Partial-zero
dates (``20240900`` day 00, ``20240015`` month 00, ``00000901`` year 0000),
impossible days (``20240230``), and non-ASCII digit shapes (fullwidth,
Arabic-Indic) indicate a hallucinated or sloppily extracted date and are
rejected; the all-zero sentinel is the only accepted value that does not
name a real calendar day.
"""

from datetime import date

NO_DATE_SENTINEL = "00000000"

_ASCII_DIGITS = frozenset("0123456789")


def is_valid_classification_date(value: str) -> bool:
"""Return whether ``value`` is the no-date sentinel or a real YYYYMMDD date.

Valid inputs are exactly ``"00000000"`` (no date available) or an
eight-character ASCII-digit ``YYYYMMDD`` string that names a real
calendar day: year >= 1, month 01-12, and a day valid for that month
with leap years honored. Anything else - partial-zero components,
impossible days like February 30, non-ASCII digit characters, wrong
lengths, or non-numeric strings - is rejected.

Args:
value: The date string to validate.

Returns:
True if the string is the sentinel or a real calendar date.
"""
if value == NO_DATE_SENTINEL:
return True
if len(value) != 8 or not all(c in _ASCII_DIGITS for c in value):
return False
try:
date(int(value[:4]), int(value[4:6]), int(value[6:8]))
except ValueError:
return False
return True


def normalize_classification_date(raw: str | None) -> str:
"""Normalize a raw date string to a canonical YYYYMMDD or the sentinel.

Strips non-ASCII-digit characters, applies the 6-digit ``YYMMDD`` to
``20YYMMDD`` expansion, truncates inputs of 8 or more digits to the
leading eight, and validates the result with
:func:`is_valid_classification_date`. Anything that fails validation
(including ``None`` and the empty string) collapses to the
:data:`NO_DATE_SENTINEL`, so callers can treat the return as always
safe to embed in a filename, tag, or downstream record.

Args:
raw: An LLM- or operator-supplied date string, or ``None``.

Returns:
A real YYYYMMDD date or :data:`NO_DATE_SENTINEL`.
"""
if raw is None:
return NO_DATE_SENTINEL
digits = "".join(c for c in raw if c in _ASCII_DIGITS)
if len(digits) == 6:
digits = f"20{digits}"
candidate = digits[:8]
if is_valid_classification_date(candidate):
return candidate
return NO_DATE_SENTINEL
25 changes: 24 additions & 1 deletion src/drover/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any

from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, field_validator

from drover.dates import is_valid_classification_date
from drover.logging import get_logger

if TYPE_CHECKING:
Expand All @@ -41,6 +42,28 @@ class GroundTruthEntry(BaseModel):
entity: str | None = Field(default=None, description="Expected entity (optional)")
notes: str | None = Field(default=None, description="Notes about this entry")

@field_validator("date")
@classmethod
def _validate_date(cls, value: str | None) -> str | None:
    """Accept only a missing date, the sentinel, or a real calendar date.

    ``None`` is passed through unchanged. Any other value must be the
    "00000000" sentinel or a real YYYYMMDD calendar date as judged by
    ``is_valid_classification_date``; a partial-zero date such as
    "20240900" in authored ground truth is treated as a data-entry bug.
    ``_load_ground_truth`` is expected to catch the resulting
    ``ValidationError`` and log the offending line, so one bad entry
    does not abort the whole load.

    Raises:
        ValueError: If ``value`` is neither ``None``, the sentinel, nor
            a real YYYYMMDD date.
    """
    if value is None or is_valid_classification_date(value):
        return value
    msg = (
        "date must be the '00000000' sentinel or a real YYYYMMDD "
        f"date, got {value!r}"
    )
    raise ValueError(msg)


@dataclass
class ClassificationComparison:
Expand Down
30 changes: 29 additions & 1 deletion src/drover/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
from enum import StrEnum
from typing import TYPE_CHECKING, Any

from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, field_validator

from drover.dates import normalize_classification_date

if TYPE_CHECKING:
from pathlib import Path
Expand Down Expand Up @@ -52,6 +54,18 @@ class ClassificationResult(BaseModel):
default=None, description="AI metrics for this classification, if collected"
)

@field_validator("date", mode="before")
@classmethod
def _normalize_date(cls, value: Any) -> str:
    """Coerce the incoming date to a canonical value; never raise.

    Anything that is not a ``str`` is handed to
    :func:`drover.dates.normalize_classification_date` as ``None``.
    Downstream consumers (filename, tag actions, JSON export, eval
    comparison) therefore only ever see a real YYYYMMDD or the no-date
    sentinel, never a partial-zero or non-ASCII-digit string.
    """
    if not isinstance(value, str):
        value = None
    return normalize_classification_date(value)


class ClassificationErrorResult(BaseModel):
"""Error result when classification fails."""
Expand Down Expand Up @@ -93,6 +107,20 @@ class RawClassification(BaseModel):
),
)

@field_validator("date", mode="before")
@classmethod
def _normalize_date(cls, value: Any) -> str:
    """Coerce the LLM-supplied date to a canonical value; never raise.

    Non-string input is forwarded to
    :func:`drover.dates.normalize_classification_date` as ``None``, so a
    hallucinated partial-zero or non-ASCII-digit date becomes the no-date
    sentinel before the naming policy, tag action, eval comparison, or
    JSON export reads it. Not raising keeps the validator compatible
    with LangChain's ``with_structured_output`` retry loop.
    """
    raw_date = value if isinstance(value, str) else None
    normalized = normalize_classification_date(raw_date)
    return normalized


class PathConstraints(BaseModel):
"""Constraints for generated file paths."""
Expand Down
27 changes: 10 additions & 17 deletions src/drover/naming/nara.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from typing import ClassVar

from drover.dates import normalize_classification_date
from drover.naming.base import BaseNamingPolicy, NamingConstraints


Expand Down Expand Up @@ -120,26 +121,18 @@ def format_filename(

return filename

def _normalize_date(self, date: str) -> str:
"""Normalize date to YYYYMMDD format.
def _normalize_date(self, date: str | None) -> str:
"""Normalize a date string to YYYYMMDD format.

Accepts various formats and normalizes to 8-digit date.
Delegates to :func:`drover.dates.normalize_classification_date`,
which strips non-ASCII-digit characters, expands 6-digit
``YYMMDD`` inputs, and returns the ``"00000000"`` sentinel for
any partial-zero, impossible, or otherwise invalid date.

Args:
date: Date string in various formats.
date: Date string in various formats, or ``None``.

Returns:
Date in YYYYMMDD format, or "00000000" if unparseable.
A real YYYYMMDD date, or ``"00000000"`` if unparseable or invalid.
"""
digits = "".join(c for c in date if c.isdigit())

if len(digits) == 8:
return digits

if len(digits) == 6:
return f"20{digits}"

if len(digits) >= 8:
return digits[:8]

return "00000000"
return normalize_classification_date(date)
4 changes: 3 additions & 1 deletion src/drover/prompts/classification.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ Select the highest priority date found in the document:

**If no date exists:** Use "00000000"

**Never zero-fill a single component:** If the year, month, or day is unknown, use "00000000" for the entire date. Do not emit a partial-zero date such as "20240900" (unknown day) or "20240015" (unknown month) - these are invalid.

### Rule 2: Vendor Identification

- Use the **full organization name** (e.g., "Northern Virginia Medical Center" not "NVMC")
Expand Down Expand Up @@ -159,7 +161,7 @@ If entity would equal the vendor, return "".
Work through these steps internally before producing the structured output. Do NOT emit this analysis as text; the response is schema-constrained and must contain only the seven fields.

1. **Extract evidence (cap your scan):** Note up to 5 organizations and up to 5 dates with their context. Prioritize letterhead, signature blocks, and the first and last pages over middle-of-document mentions. Note the document's structural form and the specific goods, services, or activities it covers.
2. **Pick the date** by priority (transaction/service > statement/issue > due) and convert to YYYYMMDD. Use "00000000" if no date is available.
2. **Pick the date** by priority (transaction/service > statement/issue > due) and convert to YYYYMMDD. Use "00000000" if no date is available or if any single component (year, month, or day) is unknown - never zero-fill one component.
3. **Pick the vendor** as the full issuing organization name, or "unknown".
4. **Draft the subject** as 2-4 lowercase words describing content (not document form).
5. **Pick the domain** by fundamental purpose. If "financial" is a candidate, explicitly check whether the financial aspect is merely transactional over a functional domain (medical, pets, property, vehicles, insurance, etc.).
Expand Down
75 changes: 75 additions & 0 deletions tests/test_dates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Tests for classification date validation and normalization."""

import pytest

from drover.dates import (
NO_DATE_SENTINEL,
is_valid_classification_date,
normalize_classification_date,
)

# Confusable-digit strings spelled as explicit escapes so the exact code
# points survive editors, diff viewers, and Unicode normalization; a literal
# fullwidth string is easily folded to ASCII, which would silently turn the
# rejection case into an ordinary valid date. Both encode "20240115" using
# non-ASCII digit code points and must be rejected by the validator.
# Fullwidth digits are U+FF10..U+FF19; Arabic-Indic digits are U+0660..U+0669.
FULLWIDTH_DIGITS_DATE = "\uff12\uff10\uff12\uff14\uff10\uff11\uff11\uff15"
ARABIC_INDIC_DIGITS_DATE = "\u0662\u0660\u0662\u0664\u0660\u0661\u0661\u0665"


class TestIsValidClassificationDate:
    """Accept/reject table for the shared classification-date validator."""

    # Inputs the validator must accept.
    _ACCEPTED = (
        "20240115",  # ordinary real date
        "20240229",  # leap day in a leap year
        NO_DATE_SENTINEL,  # the no-date sentinel
    )

    # Inputs the validator must reject.
    _REJECTED = (
        "20240900",  # day == 00
        "20240015",  # month == 00
        "00000901",  # year == 0000 (month/day valid)
        "20240230",  # February 30 never exists
        "20230229",  # not a leap year
        "20241301",  # month 13
        "2024011",  # too short
        "202401155",  # too long
        "2024-01-15",  # not eight digits
        "abcdefgh",  # non-numeric
        "",  # empty
        FULLWIDTH_DIGITS_DATE,
        ARABIC_INDIC_DIGITS_DATE,
    )

    @pytest.mark.parametrize("value", _ACCEPTED)
    def test_accepts_real_dates_and_sentinel(self, value: str) -> None:
        verdict = is_valid_classification_date(value)
        assert verdict is True

    @pytest.mark.parametrize("value", _REJECTED)
    def test_rejects_partial_zero_and_impossible_dates(self, value: str) -> None:
        verdict = is_valid_classification_date(value)
        assert verdict is False


class TestNormalizeClassificationDate:
    """The normalizer yields either a real date or the no-date sentinel."""

    # (raw input, expected normalized output)
    _CASES = (
        ("20240115", "20240115"),  # passes through
        ("240115", "20240115"),  # 6-digit YYMMDD expansion
        (NO_DATE_SENTINEL, NO_DATE_SENTINEL),  # sentinel preserved
        ("2024-01-15", "20240115"),  # strip separators
        ("2024011500", "20240115"),  # >8 digits truncated to leading 8
        ("20240900", NO_DATE_SENTINEL),  # day 00
        ("20240015", NO_DATE_SENTINEL),  # month 00
        ("00000901", NO_DATE_SENTINEL),  # year 0000
        ("20240230", NO_DATE_SENTINEL),  # impossible day
        ("", NO_DATE_SENTINEL),  # empty
        (None, NO_DATE_SENTINEL),  # None never crashes
        (FULLWIDTH_DIGITS_DATE, NO_DATE_SENTINEL),  # confusable digits rejected
    )

    @pytest.mark.parametrize(("raw", "expected"), _CASES)
    def test_normalize(self, raw: str | None, expected: str) -> None:
        normalized = normalize_classification_date(raw)
        assert normalized == expected
Loading
Loading