From 84ecfcf7052382b94d4af107e95f2a689c76a17e Mon Sep 17 00:00:00 2001 From: Edoardo Panichi <40608771+Eddy1919@users.noreply.github.com> Date: Sat, 20 Jun 2026 13:36:50 +0200 Subject: [PATCH] feat(pelagios): tune Pleiades matcher against the live corpus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ran propose_pleiades_links over all 838 distinct findspots (5,932 inscriptions) pulled from the live API. Findings, now folded in: - Stem-prefix index in propose_links (prefix_len, default 3). A full difflib pass over the ~11k-place gazetteer does not finish; the indexed path runs in ~2s. prefix_len=0 forces the old full comparison. - Stopwords: add `cum`/`et` and museum/collection scaffolding (museo, publico, collezione, …). Recovers ~70 inscriptions that scored just under threshold ("Clusium cum agro", "Clusii in museo publico" → Clusium 1.0). - Default threshold 0.84 → 0.90. Sweep showed sub-0.90 recall is mostly wrong (Clusino GA.→lake Clusinus, Parisiis→Parsiana, "in fronte DA."). At 0.90: 75 findspots / 1,280 inscriptions (58%); exact-stem alone covers 53%. docs/PELAGIOS.md records the sweep + the open precision gaps (place-type disambiguation, non-findspot string filtering). - Tests: cum phrase, museum scaffolding, prefix-index vs full-scan agreement. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/PELAGIOS.md | 27 ++++++ .../data_pipeline/propose_pleiades_links.py | 8 +- src/openetruscan/core/gazetteer.py | 93 ++++++++++++++----- tests/test_gazetteer.py | 17 ++++ 4 files changed, 121 insertions(+), 24 deletions(-) diff --git a/docs/PELAGIOS.md b/docs/PELAGIOS.md index 042911e..ccda819 100644 --- a/docs/PELAGIOS.md +++ b/docs/PELAGIOS.md @@ -53,6 +53,33 @@ Design notes: `?has_provenance` / the search facets see it), run the existing reconcile/ enrichment path. 
+### Tuning (against the live corpus, 838 distinct findspots / 5,932 inscriptions) + +The default threshold is **0.90**, chosen from a coverage/precision sweep (percentages are of a base of ~2,200 inscriptions, as the counts imply, not of all 5,932): + +| threshold | findspots matched | inscriptions covered | +| --- | --- | --- | +| 1.00 (exact stem) | 57 | 1,174 (53%) | +| 0.90 (default) | 75 | 1,280 (58%) | +| 0.84 (old default) | 90 | 1,328 (60%) | + +The extra recall below 0.90 is mostly *wrong*: `Clusino GA.` → the lake +*Clusinus* instead of the city, modern museum cities (`Parisiis` → `Parsiana`), +and position descriptors (`in fronte DA.`). Since this is a review queue, a +higher threshold keeps reviewer signal-to-noise high; lower it with +`--threshold` if you want to sweep the long tail by hand. + +Two findings fed back into the matcher: adding `cum` and museum/collection words +(`cum agro`, `in museo publico`) to the stopword set recovered ~70 inscriptions +that were scoring just under threshold; and matching is **stem-prefix indexed** +(`prefix_len`) because a full O(findspots·places) `difflib` pass over the ~11k +gazetteer places does not finish — the indexed path runs in ~2s. + +Known precision gaps still open (not threshold-fixable): **place-type +disambiguation** (prefer settlements over lakes/rivers — the gazetteer carries +feature types) and **non-findspot strings** (catalogue sigla like `GA.`/`FA.`, +pure museum provenance) that should be filtered before matching. 
+ ## Time axis — PeriodO (done) Every dated inscription now links to a [PeriodO](https://perio.do) period diff --git a/scripts/data_pipeline/propose_pleiades_links.py b/scripts/data_pipeline/propose_pleiades_links.py index 88f060c..86dee4f 100644 --- a/scripts/data_pipeline/propose_pleiades_links.py +++ b/scripts/data_pipeline/propose_pleiades_links.py @@ -98,7 +98,13 @@ def main() -> None: ap.add_argument("--gazetteer", type=Path, default=DEFAULT_GAZETTEER) ap.add_argument("--mapping", type=Path, default=DEFAULT_MAPPING) ap.add_argument("--output", type=Path, default=DEFAULT_QUEUE) - ap.add_argument("--threshold", type=float, default=0.84, help="Min match score (default 0.84).") + ap.add_argument( + "--threshold", + type=float, + default=0.90, + help="Min match score (default 0.90, tuned against the live corpus; " + "0.84 admits false positives like 'Clusino GA.'→the lake Clusinus).", + ) ap.add_argument("--top-k", type=int, default=3, help="Max candidates per findspot.") ap.add_argument( "--include-empty", diff --git a/src/openetruscan/core/gazetteer.py b/src/openetruscan/core/gazetteer.py index 780eb05..c7af9b6 100644 --- a/src/openetruscan/core/gazetteer.py +++ b/src/openetruscan/core/gazetteer.py @@ -60,6 +60,21 @@ "of", "territorio", "territory", + # Conjunction in "Clusium cum agro" — without this the phrase scores + # below threshold and a 70-inscription findspot is missed. + "cum", + "et", + # Museum / collection scaffolding: real corpus findspots are often + # "<place> in museo publico …" / "… apud <collection>". Stripping the + # housing-location words recovers the underlying place. 
+ "museo", + "museum", + "publico", + "pubblico", + "collezione", + "collection", + "coll", + "raccolta", } ) @@ -212,42 +227,74 @@ def best(self) -> LinkCandidate | None: PLEIADES_PLACE_URI = "https://pleiades.stoa.org/places/{}" +def _stem_prefix(text: str, prefix_len: int) -> str: + return stem_place_name(text).replace(" ", "")[:prefix_len] + + +def _build_name_index( + places: list[GazetteerPlace], prefix_len: int +) -> dict[str, list[tuple[GazetteerPlace, str]]]: + """Bucket every (place, name) by the name's stemmed prefix.""" + index: dict[str, list[tuple[GazetteerPlace, str]]] = {} + for place in places: + for name in place.all_names(): + index.setdefault(_stem_prefix(name, prefix_len), []).append((place, name)) + return index + + def propose_links( findspots: list[str], places: list[GazetteerPlace], *, - threshold: float = 0.84, + threshold: float = 0.90, top_k: int = 3, + prefix_len: int = 3, ) -> list[FindspotProposal]: """ Propose Pleiades links for each findspot. - For every findspot, scores it against every name of every gazetteer place, - keeps the best score per place, and returns up to ``top_k`` candidates at or - above ``threshold``, sorted by score descending. Findspots with no candidate - above threshold are returned with an empty candidate list so the caller can - distinguish "reviewed, no match" from "not yet attempted". + For every findspot, scores it against gazetteer names, keeps the best score + per place, and returns up to ``top_k`` candidates at or above ``threshold``, + sorted by score descending. Findspots with no candidate above threshold come + back with an empty candidate list so the caller can tell "reviewed, no match" + from "not yet attempted". + + Scoring is restricted to names sharing the findspot's first ``prefix_len`` + stemmed characters — a real gazetteer holds ~10k+ places and a full O(n·m) + comparison is infeasible (it does not finish on the live corpus; the indexed + path runs in ~2s). 
True toponym matches share their leading stem, so recall + loss is negligible. Set ``prefix_len=0`` to force a full comparison. + + The default ``threshold`` of 0.90 was tuned against the live corpus: at 0.90 + the proposal queue is mostly correct, while 0.84 admits systematic false + positives (e.g. "Clusino GA." → the *lake* Clusinus rather than the city). """ + if prefix_len > 0: + index = _build_name_index(places, prefix_len) + else: + all_pairs = [(place, name) for place in places for name in place.all_names()] + proposals: list[FindspotProposal] = [] for findspot in findspots: + if prefix_len > 0: + candidate_pairs = index.get(_stem_prefix(findspot, prefix_len), []) + else: + candidate_pairs = all_pairs + per_place: dict[str, LinkCandidate] = {} - for place in places: - best_for_place = 0.0 - best_name = "" - for name in place.all_names(): - s = score_match(findspot, name) - if s > best_for_place: - best_for_place, best_name = s, name - if best_for_place >= threshold: - existing = per_place.get(place.pleiades_id) - if existing is None or best_for_place > existing.score: - per_place[place.pleiades_id] = LinkCandidate( - pleiades_id=place.pleiades_id, - title=place.title, - score=round(best_for_place, 4), - matched_name=best_name, - uri=PLEIADES_PLACE_URI.format(place.pleiades_id), - ) + for place, name in candidate_pairs: + s = score_match(findspot, name) + if s < threshold: + continue + existing = per_place.get(place.pleiades_id) + if existing is None or s > existing.score: + per_place[place.pleiades_id] = LinkCandidate( + pleiades_id=place.pleiades_id, + title=place.title, + score=round(s, 4), + matched_name=name, + uri=PLEIADES_PLACE_URI.format(place.pleiades_id), + ) candidates = sorted(per_place.values(), key=lambda c: c.score, reverse=True)[:top_k] proposals.append(FindspotProposal(findspot=findspot, candidates=candidates)) return proposals diff --git a/tests/test_gazetteer.py b/tests/test_gazetteer.py index e87bf17..2ee03db 100644 --- 
a/tests/test_gazetteer.py +++ b/tests/test_gazetteer.py @@ -48,6 +48,14 @@ def test_strips_locative_scaffolding(self): def test_punctuation_becomes_separators(self): assert normalize_place_name("Clusii (in-agro)") == "clusii" + def test_strips_cum_conjunction(self): + # "Clusium cum agro" must reduce to the place, not score below threshold. + assert normalize_place_name("Clusium cum agro") == "clusium" + + def test_strips_museum_scaffolding(self): + assert normalize_place_name("Clusii in museo publico") == "clusii" + assert normalize_place_name("Volaterris in museo Guarnacci") == "volaterris guarnacci" + def test_empty(self): assert normalize_place_name("") == "" assert normalize_place_name(" ") == "" @@ -137,6 +145,15 @@ def test_batch_preserves_order_and_length(self): proposals = propose_links(findspots, GAZETTEER) assert [p.findspot for p in proposals] == findspots + def test_cum_phrase_matches_place(self): + [proposal] = propose_links(["Clusium cum agro"], GAZETTEER) + assert proposal.best is not None and proposal.best.pleiades_id == "413047" + + def test_prefix_index_and_full_scan_agree_on_real_match(self): + indexed = propose_links(["Clusii in agro"], GAZETTEER, prefix_len=3) + full = propose_links(["Clusii in agro"], GAZETTEER, prefix_len=0) + assert indexed[0].best.pleiades_id == full[0].best.pleiades_id == "413047" + @pytest.mark.parametrize( ("findspot", "expected_id"),