From 84ecfcf7052382b94d4af107e95f2a689c76a17e Mon Sep 17 00:00:00 2001
From: Edoardo Panichi <40608771+Eddy1919@users.noreply.github.com>
Date: Sat, 20 Jun 2026 13:36:50 +0200
Subject: [PATCH] feat(pelagios): tune Pleiades matcher against the live corpus
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ran propose_pleiades_links over all 838 distinct findspots (5,932
inscriptions) pulled from the live API. Findings, now folded in:

- Stem-prefix index in propose_links (prefix_len, default 3). A full
  difflib pass over the ~11k-place gazetteer does not finish; the indexed
  path runs in ~2s. prefix_len=0 forces the old full comparison.
- Stopwords: add `cum`/`et` and museum/collection scaffolding (museo,
  publico, collezione, …). Recovers ~70 inscriptions that scored just under
  threshold ("Clusium cum agro", "Clusii in museo publico" → Clusium 1.0).
- Default threshold 0.84 → 0.90. Sweep showed sub-0.90 recall is mostly
  wrong (Clusino GA.→lake Clusinus, Parisiis→Parsiana, "in fronte DA.").
  At 0.90: 75 findspots / 1,280 inscriptions (58%); exact-stem alone covers
  53%. docs/PELAGIOS.md records the sweep + the open precision gaps
  (place-type disambiguation, non-findspot string filtering).
- Tests: cum phrase, museum scaffolding, prefix-index vs full-scan agreement.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/PELAGIOS.md                              | 27 ++++++
 .../data_pipeline/propose_pleiades_links.py   |  8 +-
 src/openetruscan/core/gazetteer.py            | 93 ++++++++++++++-----
 tests/test_gazetteer.py                       | 17 ++++
 4 files changed, 121 insertions(+), 24 deletions(-)

diff --git a/docs/PELAGIOS.md b/docs/PELAGIOS.md
index 042911e..ccda819 100644
--- a/docs/PELAGIOS.md
+++ b/docs/PELAGIOS.md
@@ -53,6 +53,33 @@ Design notes:
   `?has_provenance` / the search facets see it), run the existing reconcile/
   enrichment path.
 
+### Tuning (against the live corpus, 838 distinct findspots / 5,932 inscriptions)
+
+The default threshold is **0.90**, chosen from a coverage/precision sweep:
+
+| threshold | findspots matched | inscriptions covered |
+| --- | --- | --- |
+| 1.00 (exact stem) | 57 | 1,174 (53%) |
+| 0.90 (default) | 75 | 1,280 (58%) |
+| 0.84 (old default) | 90 | 1,328 (60%) |
+
+The extra matches admitted below 0.90 are mostly *wrong*: `Clusino GA.` → the
+lake *Clusinus* instead of the city, modern museum cities (`Parisiis` →
+`Parsiana`), and position descriptors (`in fronte DA.`). Since this is a review
+queue, a higher threshold keeps reviewer signal-to-noise high; lower it with
+`--threshold` if you want to sweep the long tail by hand.
+
+Two findings fed back into the matcher: adding `cum` and museum/collection words
+(`in museo publico`, `cum agro`) to the stopword set recovered ~70 inscriptions
+that were scoring just under threshold; and matching is **stem-prefix indexed**
+(`prefix_len`) because a full O(findspots·places) `difflib` pass over the ~11k
+gazetteer places does not finish — the indexed path runs in ~2s.
+
+Known precision gaps still open (not threshold-fixable): **place-type
+disambiguation** (prefer settlements over lakes/rivers — the gazetteer carries
+feature types) and **non-findspot strings** (catalogue sigla like `GA.`/`FA.`,
+pure museum provenance) that should be filtered before matching.
+
 ## Time axis — PeriodO (done)
 
 Every dated inscription now links to a [PeriodO](https://perio.do) period
diff --git a/scripts/data_pipeline/propose_pleiades_links.py b/scripts/data_pipeline/propose_pleiades_links.py
index 88f060c..86dee4f 100644
--- a/scripts/data_pipeline/propose_pleiades_links.py
+++ b/scripts/data_pipeline/propose_pleiades_links.py
@@ -98,7 +98,13 @@ def main() -> None:
     ap.add_argument("--gazetteer", type=Path, default=DEFAULT_GAZETTEER)
     ap.add_argument("--mapping", type=Path, default=DEFAULT_MAPPING)
     ap.add_argument("--output", type=Path, default=DEFAULT_QUEUE)
-    ap.add_argument("--threshold", type=float, default=0.84, help="Min match score (default 0.84).")
+    ap.add_argument(
+        "--threshold",
+        type=float,
+        default=0.90,
+        help="Min match score (default 0.90, tuned against the live corpus; "
+        "0.84 admits false positives like 'Clusino GA.'→the lake Clusinus).",
+    )
     ap.add_argument("--top-k", type=int, default=3, help="Max candidates per findspot.")
     ap.add_argument(
         "--include-empty",
diff --git a/src/openetruscan/core/gazetteer.py b/src/openetruscan/core/gazetteer.py
index 780eb05..c7af9b6 100644
--- a/src/openetruscan/core/gazetteer.py
+++ b/src/openetruscan/core/gazetteer.py
@@ -60,6 +60,21 @@
         "of",
         "territorio",
         "territory",
+        # "cum" is the preposition in "Clusium cum agro"; unless it is stripped the
+        # phrase scores below threshold and a 70-inscription findspot is missed.
+        "cum",
+        "et",
+        # Museum / collection scaffolding: real corpus findspots are often
+        # "<place> in museo publico …" / "… apud <collector>". Stripping the
+        # housing-location words recovers the underlying place.
+        "museo",
+        "museum",
+        "publico",
+        "pubblico",
+        "collezione",
+        "collection",
+        "coll",
+        "raccolta",
     }
 )
 
@@ -212,42 +227,74 @@ def best(self) -> LinkCandidate | None:
 PLEIADES_PLACE_URI = "https://pleiades.stoa.org/places/{}"
 
 
+def _stem_prefix(text: str, prefix_len: int) -> str:
+    return stem_place_name(text).replace(" ", "")[:prefix_len]
+
+
+def _build_name_index(
+    places: list[GazetteerPlace], prefix_len: int
+) -> dict[str, list[tuple[GazetteerPlace, str]]]:
+    """Bucket every (place, name) by the name's stemmed prefix."""
+    index: dict[str, list[tuple[GazetteerPlace, str]]] = {}
+    for place in places:
+        for name in place.all_names():
+            index.setdefault(_stem_prefix(name, prefix_len), []).append((place, name))
+    return index
+
+
 def propose_links(
     findspots: list[str],
     places: list[GazetteerPlace],
     *,
-    threshold: float = 0.84,
+    threshold: float = 0.90,
     top_k: int = 3,
+    prefix_len: int = 3,
 ) -> list[FindspotProposal]:
     """
     Propose Pleiades links for each findspot.
 
-    For every findspot, scores it against every name of every gazetteer place,
-    keeps the best score per place, and returns up to ``top_k`` candidates at or
-    above ``threshold``, sorted by score descending. Findspots with no candidate
-    above threshold are returned with an empty candidate list so the caller can
-    distinguish "reviewed, no match" from "not yet attempted".
+    For every findspot, scores it against gazetteer names, keeps the best score
+    per place, and returns up to ``top_k`` candidates at or above ``threshold``,
+    sorted by score descending. Findspots with no candidate at or above the
+    threshold come back with an empty candidate list so the caller can tell
+    "reviewed, no match" from "not yet attempted".
+
+    Scoring is restricted to names sharing the findspot's first ``prefix_len``
+    stemmed characters — a real gazetteer holds ~10k+ places and a full O(n·m)
+    comparison is infeasible (it does not finish on the live corpus; the indexed
+    path runs in ~2s). True toponym matches share their leading stem, so recall
+    loss is negligible. Set ``prefix_len=0`` to force a full comparison.
+
+    The default ``threshold`` of 0.90 was tuned against the live corpus: at 0.90
+    the proposal queue is mostly correct, while 0.84 admits systematic false
+    positives (e.g. "Clusino GA." → the *lake* Clusinus rather than the city).
     """
+    if prefix_len > 0:
+        index = _build_name_index(places, prefix_len)
+    else:
+        all_pairs = [(place, name) for place in places for name in place.all_names()]
+
     proposals: list[FindspotProposal] = []
     for findspot in findspots:
+        if prefix_len > 0:
+            candidate_pairs = index.get(_stem_prefix(findspot, prefix_len), [])
+        else:
+            candidate_pairs = all_pairs
+
         per_place: dict[str, LinkCandidate] = {}
-        for place in places:
-            best_for_place = 0.0
-            best_name = ""
-            for name in place.all_names():
-                s = score_match(findspot, name)
-                if s > best_for_place:
-                    best_for_place, best_name = s, name
-            if best_for_place >= threshold:
-                existing = per_place.get(place.pleiades_id)
-                if existing is None or best_for_place > existing.score:
-                    per_place[place.pleiades_id] = LinkCandidate(
-                        pleiades_id=place.pleiades_id,
-                        title=place.title,
-                        score=round(best_for_place, 4),
-                        matched_name=best_name,
-                        uri=PLEIADES_PLACE_URI.format(place.pleiades_id),
-                    )
+        for place, name in candidate_pairs:
+            s = score_match(findspot, name)
+            if s < threshold:
+                continue
+            existing = per_place.get(place.pleiades_id)
+            if existing is None or s > existing.score:
+                per_place[place.pleiades_id] = LinkCandidate(
+                    pleiades_id=place.pleiades_id,
+                    title=place.title,
+                    score=round(s, 4),
+                    matched_name=name,
+                    uri=PLEIADES_PLACE_URI.format(place.pleiades_id),
+                )
         candidates = sorted(per_place.values(), key=lambda c: c.score, reverse=True)[:top_k]
         proposals.append(FindspotProposal(findspot=findspot, candidates=candidates))
     return proposals
diff --git a/tests/test_gazetteer.py b/tests/test_gazetteer.py
index e87bf17..2ee03db 100644
--- a/tests/test_gazetteer.py
+++ b/tests/test_gazetteer.py
@@ -48,6 +48,14 @@ def test_strips_locative_scaffolding(self):
     def test_punctuation_becomes_separators(self):
         assert normalize_place_name("Clusii (in-agro)") == "clusii"
 
+    def test_strips_cum_conjunction(self):
+        # "Clusium cum agro" must reduce to the place, not score below threshold.
+        assert normalize_place_name("Clusium cum agro") == "clusium"
+
+    def test_strips_museum_scaffolding(self):
+        assert normalize_place_name("Clusii in museo publico") == "clusii"
+        assert normalize_place_name("Volaterris in museo Guarnacci") == "volaterris guarnacci"
+
     def test_empty(self):
         assert normalize_place_name("") == ""
         assert normalize_place_name("   ") == ""
@@ -137,6 +145,15 @@ def test_batch_preserves_order_and_length(self):
         proposals = propose_links(findspots, GAZETTEER)
         assert [p.findspot for p in proposals] == findspots
 
+    def test_cum_phrase_matches_place(self):
+        [proposal] = propose_links(["Clusium cum agro"], GAZETTEER)
+        assert proposal.best is not None and proposal.best.pleiades_id == "413047"
+
+    def test_prefix_index_and_full_scan_agree_on_real_match(self):
+        indexed = propose_links(["Clusii in agro"], GAZETTEER, prefix_len=3)
+        full = propose_links(["Clusii in agro"], GAZETTEER, prefix_len=0)
+        assert indexed[0].best.pleiades_id == full[0].best.pleiades_id == "413047"
+
 
 @pytest.mark.parametrize(
     ("findspot", "expected_id"),