From 8c365894b8590398669e2481bb616c9c615cc8a3 Mon Sep 17 00:00:00 2001
From: Chris Krough <461869+ckrough@users.noreply.github.com>
Date: Fri, 29 May 2026 07:43:33 -0400
Subject: [PATCH] fix: reject partial-zero classification dates (prof-5qy)

Classification dates are either the "00000000" no-date sentinel or a real YYYYMMDD calendar date with ASCII digits, year >= 1, month 01-12, and a day valid for that month with leap years honored. Partial-zero components ("20240900" day 00, "20240015" month 00, "00000901" year 0000), impossible dates (e.g. "20240230"), and non-ASCII digit characters collapse to the sentinel.

drover.dates exposes is_valid_classification_date() and normalize_classification_date(). The model boundary (RawClassification.date, ClassificationResult.date) carries a non-raising mode="before" field_validator that normalizes the LLM-supplied date before any downstream consumer (naming policy, tag actions, eval comparison, JSON export, on-disk reloads) reads it. NARA naming delegates to the shared normalizer. GroundTruthEntry.date raises on bad values; _load_ground_truth catches the resulting ValidationError and logs the offending line. The synthetic-sample generator validates against the same rule. The classification prompt instructs the model to emit the sentinel rather than zero-fill a single component.
---
 CLAUDE.md                            |  2 +
 README.md                            |  2 +-
 scripts/generate_eval_samples.py     | 12 ++---
 src/drover/dates.py                  | 70 ++++++++++++++++++++++++++
 src/drover/evaluation.py             | 25 +++++++++-
 src/drover/models.py                 | 30 ++++++++++-
 src/drover/naming/nara.py            | 27 ++++------
 src/drover/prompts/classification.md |  4 +-
 tests/test_dates.py                  | 75 ++++++++++++++++++++++++++++
 tests/test_evaluation.py             | 60 ++++++++++++++++++++++
 tests/test_models.py                 | 51 +++++++++++++++++++
 tests/test_naming.py                 | 27 ++++++++++
 12 files changed, 358 insertions(+), 27 deletions(-)
 create mode 100644 src/drover/dates.py
 create mode 100644 tests/test_dates.py
 create mode 100644 tests/test_evaluation.py

diff --git a/CLAUDE.md b/CLAUDE.md
index 33b8d3f..aeb0e8c 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -140,6 +140,8 @@ uv run bandit -r src/ -f json --severity-level medium --confidence-level medium
 - `ClassificationResult` → Final output with suggested_path
 - `ClassificationErrorResult` → Error response with error_code
 
+The `date` field on `RawClassification` and `ClassificationResult` carries a `mode="before"` normalizing validator that routes through `drover.dates.normalize_classification_date`. The model boundary always returns either a real `YYYYMMDD` calendar date (ASCII digits, leap-aware) or the `"00000000"` no-date sentinel; partial-zero components, impossible days, and non-ASCII digit characters collapse to the sentinel. Ground-truth entries (`evaluation.GroundTruthEntry.date`) are checked strictly with a raising validator instead: no separator stripping or `YYMMDD` expansion is applied, and anything other than the sentinel or a real eight-digit `YYYYMMDD` date is rejected. The ground-truth loader logs and skips the offending line, so bad authored data surfaces at load without aborting it.
+
 ### Configuration (`config.py`)
 Precedence: CLI options > config file > environment (DROVER_*) > defaults
 
diff --git a/README.md b/README.md
index 602049f..7d0b6c1 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ Drover uses LLMs to analyze documents and suggest consistent, policy-compliant f
 - **Intelligent Classification** — Categorizes documents by domain, category, and document type
 - **Smart Sampling** — Adaptive page sampling for efficient processing of large documents
 - **Taxonomy System** — Extensible controlled vocabularies with strict or fallback modes
-- **NARA-Compliant Naming** — Generates standardized filenames: `{doctype}-{vendor}-{subject}-{entity}-{date}.pdf`. The `entity` slot (pet, patient, performer, brand) is optional and is dropped when empty, when it would duplicate the vendor, or for privacy-sensitive domains.
+- **NARA-Compliant Naming** — Generates standardized filenames: `{doctype}-{vendor}-{subject}-{entity}-{date}.pdf`. The `entity` slot (pet, patient, performer, brand) is optional and is dropped when empty, when it would duplicate the vendor, or for privacy-sensitive domains. The `date` slot is a real YYYYMMDD calendar date or the `00000000` no-date sentinel; partial-zero or impossible dates from the model collapse to the sentinel rather than entering the filename.
 - **macOS Tagging** — Apply classification as native filesystem tags
 - **Batch Processing** — Classify multiple documents with JSONL output
 - **Evaluation Framework** — Measure accuracy against ground truth datasets
diff --git a/scripts/generate_eval_samples.py b/scripts/generate_eval_samples.py
index 0fa2b47..1407839 100644
--- a/scripts/generate_eval_samples.py
+++ b/scripts/generate_eval_samples.py
@@ -31,6 +31,7 @@
 sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
 
 from drover.config import LogLevel
+from drover.dates import is_valid_classification_date
 from drover.logging import configure_logging, get_logger
 from drover.taxonomy.loader import get_taxonomy
 
@@ -220,12 +221,11 @@ class GroundTruthRow(BaseModel):
     @field_validator("date")
     @classmethod
     def _validate_date(cls, v: str) -> str:
-        if v == "00000000":
-            return v
-        if not re.fullmatch(r"\d{8}", v):
-            raise ValueError(f"date must be YYYYMMDD or '00000000', got {v!r}")
-        if "0000" in (v[:4], v[4:6], v[6:8]):
-            raise ValueError(f"partial-zero dates are forbidden, got {v!r}")
+        if not is_valid_classification_date(v):
+            raise ValueError(
+                "date must be the '00000000' sentinel or a real YYYYMMDD "
+                f"date, got {v!r}"
+            )
         return v
 
 
diff --git a/src/drover/dates.py b/src/drover/dates.py
new file mode 100644
index 0000000..4445e47
--- /dev/null
+++ b/src/drover/dates.py
@@ -0,0 +1,70 @@
+"""Validation and normalization for classification date strings.
+
+A classification date is either the ``00000000`` "no date" sentinel or an
+eight-digit ASCII ``YYYYMMDD`` string naming a real calendar day. Partial-zero
+dates (``20240900`` day 00, ``20240015`` month 00, ``00000901`` year 0000),
+impossible days (``20240230``), and non-ASCII digits (fullwidth,
+Arabic-Indic) indicate a hallucinated or sloppily extracted date and are
+rejected; a zero year, month, or day is valid only in the all-zero sentinel.
+"""
+
+from datetime import date
+
+NO_DATE_SENTINEL = "00000000"
+
+_ASCII_DIGITS = frozenset("0123456789")
+
+
+def is_valid_classification_date(value: str) -> bool:
+    """Return whether ``value`` is the no-date sentinel or a real YYYYMMDD date.
+
+    Valid inputs are exactly ``"00000000"`` (no date available) or an
+    eight-character ASCII-digit ``YYYYMMDD`` string that names a real
+    calendar day: year >= 1, month 01-12, and a day valid for that month
+    with leap years honored. Anything else - partial-zero components,
+    impossible days like February 30, non-ASCII digit characters, wrong
+    lengths, or non-numeric strings - is rejected.
+
+    Args:
+        value: The date string to validate.
+
+    Returns:
+        True if the string is the sentinel or a real calendar date.
+    """
+    if value == NO_DATE_SENTINEL:
+        return True
+    if len(value) != 8 or not all(c in _ASCII_DIGITS for c in value):
+        return False
+    try:
+        date(int(value[:4]), int(value[4:6]), int(value[6:8]))
+    except ValueError:
+        return False
+    return True
+
+
+def normalize_classification_date(raw: str | None) -> str:
+    """Normalize a raw date string to a canonical YYYYMMDD or the sentinel.
+
+    Strips non-ASCII-digit characters, applies the 6-digit ``YYMMDD`` to
+    ``20YYMMDD`` expansion, truncates inputs of 8 or more digits to the
+    leading eight, and validates the result with
+    :func:`is_valid_classification_date`. Anything that fails validation
+    (including ``None`` and the empty string) collapses to the
+    :data:`NO_DATE_SENTINEL`, so callers can treat the return as always
+    safe to embed in a filename, tag, or downstream record.
+
+    Args:
+        raw: An LLM- or operator-supplied date string, or ``None``.
+
+    Returns:
+        A real YYYYMMDD date or :data:`NO_DATE_SENTINEL`.
+    """
+    if raw is None:
+        return NO_DATE_SENTINEL
+    digits = "".join(c for c in raw if c in _ASCII_DIGITS)
+    if len(digits) == 6:
+        digits = f"20{digits}"
+    candidate = digits[:8]
+    if is_valid_classification_date(candidate):
+        return candidate
+    return NO_DATE_SENTINEL
diff --git a/src/drover/evaluation.py b/src/drover/evaluation.py
index 27ce8c9..525bcfd 100644
--- a/src/drover/evaluation.py
+++ b/src/drover/evaluation.py
@@ -14,8 +14,9 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 
+from drover.dates import is_valid_classification_date
 from drover.logging import get_logger
 
 if TYPE_CHECKING:
@@ -41,6 +42,28 @@ class GroundTruthEntry(BaseModel):
     entity: str | None = Field(default=None, description="Expected entity (optional)")
     notes: str | None = Field(default=None, description="Notes about this entry")
 
+    @field_validator("date")
+    @classmethod
+    def _validate_date(cls, value: str | None) -> str | None:
+        """Reject partial-zero or impossible ground-truth dates.
+
+        A ground-truth date must be the "00000000" sentinel or a real
+        YYYYMMDD calendar date. A partial-zero date (e.g. "20240900") in
+        authored ground truth is a data-entry bug. The validator raises
+        on bad values; ``_load_ground_truth`` catches the resulting
+        ``ValidationError`` and logs the offending line so a single bad
+        entry does not abort the whole load.
+        """
+        if value is None:
+            return value
+        if not is_valid_classification_date(value):
+            msg = (
+                "date must be the '00000000' sentinel or a real YYYYMMDD "
+                f"date, got {value!r}"
+            )
+            raise ValueError(msg)
+        return value
+
 
 @dataclass
 class ClassificationComparison:
diff --git a/src/drover/models.py b/src/drover/models.py
index 316c8a2..7d6fc60 100644
--- a/src/drover/models.py
+++ b/src/drover/models.py
@@ -3,7 +3,9 @@
 from enum import StrEnum
 from typing import TYPE_CHECKING, Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
+
+from drover.dates import normalize_classification_date
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -52,6 +54,18 @@ class ClassificationResult(BaseModel):
         default=None, description="AI metrics for this classification, if collected"
     )
 
+    @field_validator("date", mode="before")
+    @classmethod
+    def _normalize_date(cls, value: Any) -> str:
+        """Normalize the date at the model boundary, never raise.
+
+        Routes through :func:`drover.dates.normalize_classification_date`
+        so every downstream consumer (filename, tag actions, JSON export,
+        eval comparison) sees a real YYYYMMDD or the no-date sentinel,
+        never a partial-zero or non-ASCII-digit string.
+        """
+        return normalize_classification_date(value if isinstance(value, str) else None)
+
 
 class ClassificationErrorResult(BaseModel):
     """Error result when classification fails."""
@@ -93,6 +107,20 @@ class RawClassification(BaseModel):
         ),
     )
 
+    @field_validator("date", mode="before")
+    @classmethod
+    def _normalize_date(cls, value: Any) -> str:
+        """Normalize the LLM-supplied date at the model boundary, never raise.
+
+        Routes through :func:`drover.dates.normalize_classification_date`
+        so a hallucinated partial-zero or non-ASCII-digit date is coerced
+        to the no-date sentinel before any downstream consumer (naming
+        policy, tag action, eval comparison, JSON export) reads it. The
+        validator never raises, which keeps it compatible with LangChain's
+        ``with_structured_output`` retry loop.
+        """
+        return normalize_classification_date(value if isinstance(value, str) else None)
+
 
 class PathConstraints(BaseModel):
     """Constraints for generated file paths."""
diff --git a/src/drover/naming/nara.py b/src/drover/naming/nara.py
index 26c0aba..584582e 100644
--- a/src/drover/naming/nara.py
+++ b/src/drover/naming/nara.py
@@ -11,6 +11,7 @@
 
 from typing import ClassVar
 
+from drover.dates import normalize_classification_date
 from drover.naming.base import BaseNamingPolicy, NamingConstraints
 
 
@@ -120,26 +121,18 @@ def format_filename(
 
         return filename
 
-    def _normalize_date(self, date: str) -> str:
-        """Normalize date to YYYYMMDD format.
+    def _normalize_date(self, date: str | None) -> str:
+        """Normalize a date string to YYYYMMDD format.
 
-        Accepts various formats and normalizes to 8-digit date.
+        Delegates to :func:`drover.dates.normalize_classification_date`,
+        which strips non-ASCII-digit characters, expands 6-digit
+        ``YYMMDD`` inputs, and returns the ``"00000000"`` sentinel for
+        any partial-zero, impossible, or otherwise invalid date.
 
         Args:
-            date: Date string in various formats.
+            date: Date string in various formats, or ``None``.
 
         Returns:
-            Date in YYYYMMDD format, or "00000000" if unparseable.
+            A real YYYYMMDD date, or ``"00000000"`` if unparseable or invalid.
         """
-        digits = "".join(c for c in date if c.isdigit())
-
-        if len(digits) == 8:
-            return digits
-
-        if len(digits) == 6:
-            return f"20{digits}"
-
-        if len(digits) >= 8:
-            return digits[:8]
-
-        return "00000000"
+        return normalize_classification_date(date)
diff --git a/src/drover/prompts/classification.md b/src/drover/prompts/classification.md
index 2a278b4..1c2a8f4 100644
--- a/src/drover/prompts/classification.md
+++ b/src/drover/prompts/classification.md
@@ -46,6 +46,8 @@ Select the highest priority date found in the document:
 
 **If no date exists:** Use "00000000"
 
+**Never zero-fill individual components:** If any of the year, month, or day is unknown, use "00000000" for the entire date. Do not emit a partial-zero date such as "20240900" (unknown day), "20240015" (unknown month), or "20240000" (unknown month and day) - these are invalid.
+
 ### Rule 2: Vendor Identification
 
 - Use the **full organization name** (e.g., "Northern Virginia Medical Center" not "NVMC")
@@ -159,7 +161,7 @@ If entity would equal the vendor, return "".
 Work through these steps internally before producing the structured output. Do NOT emit this analysis as text; the response is schema-constrained and must contain only the seven fields.
 
 1. **Extract evidence (cap your scan):** Note up to 5 organizations and up to 5 dates with their context. Prioritize letterhead, signature blocks, and the first and last pages over middle-of-document mentions. Note the document's structural form and the specific goods, services, or activities it covers.
-2. **Pick the date** by priority (transaction/service > statement/issue > due) and convert to YYYYMMDD. Use "00000000" if no date is available.
+2. **Pick the date** by priority (transaction/service > statement/issue > due) and convert to YYYYMMDD. Use "00000000" if no date is available or if any component (year, month, or day) is unknown - never zero-fill individual components.
 3. **Pick the vendor** as the full issuing organization name, or "unknown".
 4. **Draft the subject** as 2-4 lowercase words describing content (not document form).
 5. **Pick the domain** by fundamental purpose. If "financial" is a candidate, explicitly check whether the financial aspect is merely transactional over a functional domain (medical, pets, property, vehicles, insurance, etc.).
diff --git a/tests/test_dates.py b/tests/test_dates.py
new file mode 100644
index 0000000..b093637
--- /dev/null
+++ b/tests/test_dates.py
@@ -0,0 +1,75 @@
+"""Tests for classification date validation and normalization."""
+
+import pytest
+
+from drover.dates import (
+    NO_DATE_SENTINEL,
+    is_valid_classification_date,
+    normalize_classification_date,
+)
+
+# Confusable-digit strings included verbatim so the test exercises exactly
+# the bytes a hallucinating LLM could emit. Both spell "20240115" but contain
+# non-ASCII digits (the first mixes fullwidth with ASCII); both must be rejected.
+FULLWIDTH_DIGITS_DATE = "２０２４0115"  # noqa: RUF001
+ARABIC_INDIC_DIGITS_DATE = "٢٠٢٤٠١١٥"
+
+
+class TestIsValidClassificationDate:
+    """Truth table for the shared classification-date validator."""
+
+    @pytest.mark.parametrize(
+        "value",
+        [
+            "20240115",  # ordinary real date
+            "20240229",  # leap day in a leap year
+            NO_DATE_SENTINEL,  # the no-date sentinel
+        ],
+    )
+    def test_accepts_real_dates_and_sentinel(self, value: str) -> None:
+        assert is_valid_classification_date(value) is True
+
+    @pytest.mark.parametrize(
+        "value",
+        [
+            "20240900",  # day == 00
+            "20240015",  # month == 00
+            "00000901",  # year == 0000 (month/day valid)
+            "20240230",  # February 30 never exists
+            "20230229",  # not a leap year
+            "20241301",  # month 13
+            "2024011",  # too short
+            "202401155",  # too long
+            "2024-01-15",  # not eight digits
+            "abcdefgh",  # non-numeric
+            "",  # empty
+            FULLWIDTH_DIGITS_DATE,
+            ARABIC_INDIC_DIGITS_DATE,
+        ],
+    )
+    def test_rejects_partial_zero_and_impossible_dates(self, value: str) -> None:
+        assert is_valid_classification_date(value) is False
+
+
+class TestNormalizeClassificationDate:
+    """normalize_classification_date returns a real date or the sentinel."""
+
+    @pytest.mark.parametrize(
+        ("raw", "expected"),
+        [
+            ("20240115", "20240115"),  # passes through
+            ("240115", "20240115"),  # 6-digit YYMMDD expansion
+            (NO_DATE_SENTINEL, NO_DATE_SENTINEL),  # sentinel preserved
+            ("2024-01-15", "20240115"),  # strip separators
+            ("2024011500", "20240115"),  # >8 digits truncated to leading 8
+            ("20240900", NO_DATE_SENTINEL),  # day 00
+            ("20240015", NO_DATE_SENTINEL),  # month 00
+            ("00000901", NO_DATE_SENTINEL),  # year 0000
+            ("20240230", NO_DATE_SENTINEL),  # impossible day
+            ("", NO_DATE_SENTINEL),  # empty
+            (None, NO_DATE_SENTINEL),  # None never crashes
+            (FULLWIDTH_DIGITS_DATE, NO_DATE_SENTINEL),  # confusable digits rejected
+        ],
+    )
+    def test_normalize(self, raw: str | None, expected: str) -> None:
+        assert normalize_classification_date(raw) == expected
diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py
new file mode 100644
index 0000000..0f6bd84
--- /dev/null
+++ b/tests/test_evaluation.py
@@ -0,0 +1,60 @@
+"""Tests for the evaluation framework's data models."""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING
+
+import pytest
+from pydantic import ValidationError
+
+from drover.dates import NO_DATE_SENTINEL
+from drover.evaluation import ClassificationEvaluator, GroundTruthEntry
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+class TestGroundTruthEntryDateValidation:
+    """GroundTruthEntry rejects partial-zero and impossible dates."""
+
+    def _entry(self, date: str | None) -> GroundTruthEntry:
+        return GroundTruthEntry.model_validate(
+            {
+                "filename": "doc.pdf",
+                "domain": "financial",
+                "category": "banking",
+                "doctype": "statement",
+                "date": date,
+            }
+        )
+
+    @pytest.mark.parametrize("date", ["20240115", NO_DATE_SENTINEL, None])
+    def test_accepts_real_date_sentinel_and_missing(self, date: str | None) -> None:
+        assert self._entry(date).date == date
+
+    @pytest.mark.parametrize("date", ["20240900", "20240015", "00000901"])
+    def test_rejects_partial_zero_dates(self, date: str) -> None:
+        with pytest.raises(ValidationError):
+            self._entry(date)
+
+
+class TestGroundTruthLoaderSkipsBadDates:
+    """A partial-zero ground-truth line is skipped, not fatal to the load."""
+
+    def test_bad_date_line_is_skipped_not_fatal(self, tmp_path: Path) -> None:
+        good = {
+            "filename": "good.pdf",
+            "domain": "financial",
+            "category": "banking",
+            "doctype": "statement",
+            "date": "20240115",
+        }
+        bad = {**good, "filename": "bad.pdf", "date": "20240900"}
+        gt = tmp_path / "ground_truth.jsonl"
+        gt.write_text(json.dumps(good) + "\n" + json.dumps(bad) + "\n")
+
+        evaluator = ClassificationEvaluator(ground_truth_path=str(gt))
+
+        assert "good.pdf" in evaluator.ground_truth
+        assert "bad.pdf" not in evaluator.ground_truth
diff --git a/tests/test_models.py b/tests/test_models.py
index 1749baa..10452df 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -1,5 +1,8 @@
 """Tests for Pydantic models."""
 
+import pytest
+
+from drover.dates import NO_DATE_SENTINEL
 from drover.models import (
     ClassificationErrorResult,
     ClassificationResult,
@@ -7,6 +10,10 @@
     RawClassification,
 )
 
+# Confusable-digit string spelling "20240115" with fullwidth digits for the
+# year, used to verify the model-boundary normalizer rejects non-ASCII digits.
+_FULLWIDTH_DIGITS_DATE = "２０２４0115"  # noqa: RUF001
+
 
 def test_classification_result_success():
     """Test creating a successful classification result.
@@ -95,3 +102,47 @@ def test_classification_result_entity_defaults_to_empty():
         subject="checking",
     )
     assert result.entity == ""
+
+
+@pytest.mark.parametrize(
+    ("raw_date", "expected"),
+    [
+        ("20240115", "20240115"),  # real date preserved
+        (NO_DATE_SENTINEL, NO_DATE_SENTINEL),  # sentinel preserved
+        ("20240900", NO_DATE_SENTINEL),  # day 00 normalized
+        ("20240015", NO_DATE_SENTINEL),  # month 00 normalized
+        ("00000901", NO_DATE_SENTINEL),  # year 0000 normalized
+        ("20240230", NO_DATE_SENTINEL),  # impossible day normalized
+        (_FULLWIDTH_DIGITS_DATE, NO_DATE_SENTINEL),  # confusable digits normalized
+        ("240115", "20240115"),  # 6-digit YYMMDD expanded
+    ],
+)
+def test_raw_classification_normalizes_date_at_boundary(
+    raw_date: str, expected: str
+) -> None:
+    """The LLM-supplied date is normalized before any downstream consumer reads it."""
+    raw = RawClassification(
+        domain="financial",
+        category="banking",
+        doctype="statement",
+        vendor="chase",
+        date=raw_date,
+        subject="checking",
+    )
+    assert raw.date == expected
+
+
+def test_classification_result_normalizes_date_at_boundary() -> None:
+    """ClassificationResult also normalizes its date for safe downstream reuse."""
+    result = ClassificationResult(
+        original="doc.pdf",
+        suggested_path="financial/banking/statement/doc.pdf",
+        suggested_filename="doc.pdf",
+        domain="financial",
+        category="banking",
+        doctype="statement",
+        vendor="chase",
+        date="20240900",  # day 00 from the LLM
+        subject="checking",
+    )
+    assert result.date == NO_DATE_SENTINEL
diff --git a/tests/test_naming.py b/tests/test_naming.py
index 0c48dbd..9a486a7 100644
--- a/tests/test_naming.py
+++ b/tests/test_naming.py
@@ -2,6 +2,7 @@
 
 import pytest
 
+from drover.dates import NO_DATE_SENTINEL
 from drover.naming import (
     NARAPolicyNaming,
     get_naming_loader,
@@ -86,6 +87,32 @@ def test_format_filename_date_normalization(self, policy: NARAPolicyNaming) -> N
         )
         assert "20240115" in filename
 
+    @pytest.mark.parametrize("bad_date", ["20240900", "20240015", "00000901"])
+    def test_format_filename_partial_zero_date_becomes_sentinel(
+        self, policy: NARAPolicyNaming, bad_date: str
+    ) -> None:
+        """Partial-zero dates normalize to the no-date sentinel, not the filename."""
+        filename = policy.format_filename(
+            doctype="statement",
+            vendor="test",
+            subject="test",
+            date=bad_date,
+            extension=".pdf",
+        )
+        assert bad_date not in filename
+        assert NO_DATE_SENTINEL in filename
+
+    def test_normalize_date_delegates_to_shared_normalizer(
+        self, policy: NARAPolicyNaming
+    ) -> None:
+        """_normalize_date routes invalid dates to the sentinel and preserves real dates."""
+        assert policy._normalize_date("20240900") == NO_DATE_SENTINEL
+        assert policy._normalize_date("20240230") == NO_DATE_SENTINEL
+        assert policy._normalize_date(None) == NO_DATE_SENTINEL
+        assert policy._normalize_date("20240115") == "20240115"
+        assert policy._normalize_date("240115") == "20240115"
+        assert policy._normalize_date(NO_DATE_SENTINEL) == NO_DATE_SENTINEL
+
     def test_format_filename_missing_extension_dot(
         self, policy: NARAPolicyNaming
     ) -> None: