Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions docs/PELAGIOS.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,33 @@ Design notes:
`?has_provenance` / the search facets see it), run the existing reconcile/
enrichment path.

### Tuning (against the live corpus, 838 distinct findspots / 5,932 inscriptions)

The default threshold is **0.90**, chosen from a coverage/precision sweep:

| threshold | findspots matched | inscriptions covered |
| --- | --- | --- |
| 1.00 (exact stem) | 57 | 1,174 (53%) |
| 0.90 (default) | 75 | 1,280 (58%) |
| 0.84 (old default) | 90 | 1,328 (60%) |

The extra matches admitted below 0.90 are mostly *wrong*: lake-for-city
confusions (`Clusino GA.` → the lake *Clusinus* instead of the city), modern
museum cities (`Parisiis` → `Parsiana`), and position descriptors
(`in fronte DA.`). Since this is a review queue, a higher threshold keeps the
reviewer's signal-to-noise ratio high; lower it with `--threshold` if you want
to sweep the long tail by hand.

Two findings fed back into the matcher. First, adding `cum` and
museum/collection words to the stopword set (so that `cum agro` and
`in museo publico` are stripped) recovered ~70 inscriptions that were scoring
just under threshold. Second, matching is **stem-prefix indexed** (`prefix_len`)
because a full O(findspots·places) `difflib` pass over the ~11k gazetteer places
does not finish in practical time, whereas the indexed path runs in ~2s.

Known precision gaps still open (not threshold-fixable): **place-type
disambiguation** (prefer settlements over lakes/rivers — the gazetteer carries
feature types) and **non-findspot strings** (catalogue sigla like `GA.`/`FA.`,
pure museum provenance) that should be filtered before matching.

## Time axis — PeriodO (done)

Every dated inscription now links to a [PeriodO](https://perio.do) period
Expand Down
8 changes: 7 additions & 1 deletion scripts/data_pipeline/propose_pleiades_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,13 @@ def main() -> None:
ap.add_argument("--gazetteer", type=Path, default=DEFAULT_GAZETTEER)
ap.add_argument("--mapping", type=Path, default=DEFAULT_MAPPING)
ap.add_argument("--output", type=Path, default=DEFAULT_QUEUE)
ap.add_argument("--threshold", type=float, default=0.84, help="Min match score (default 0.84).")
ap.add_argument(
"--threshold",
type=float,
default=0.90,
help="Min match score (default 0.90, tuned against the live corpus; "
"0.84 admits false positives like 'Clusino GA.'→the lake Clusinus).",
)
ap.add_argument("--top-k", type=int, default=3, help="Max candidates per findspot.")
ap.add_argument(
"--include-empty",
Expand Down
93 changes: 70 additions & 23 deletions src/openetruscan/core/gazetteer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,21 @@
"of",
"territorio",
"territory",
# Conjunction in "Clusium cum agro" — without this the phrase scores
# below threshold and a 70-inscription findspot is missed.
"cum",
"et",
# Museum / collection scaffolding: real corpus findspots are often
# "<place> in museo publico …" / "… apud <collector>". Stripping the
# housing-location words recovers the underlying place.
"museo",
"museum",
"publico",
"pubblico",
"collezione",
"collection",
"coll",
"raccolta",
}
)

Expand Down Expand Up @@ -212,42 +227,74 @@ def best(self) -> LinkCandidate | None:
PLEIADES_PLACE_URI = "https://pleiades.stoa.org/places/{}"


def _stem_prefix(text: str, prefix_len: int) -> str:
    """Return the bucket key for ``text``.

    The key is the first ``prefix_len`` characters of the stemmed place name
    once every space has been removed from it.
    """
    compact_stem = stem_place_name(text).replace(" ", "")
    return compact_stem[:prefix_len]


def _build_name_index(
    places: list[GazetteerPlace], prefix_len: int
) -> dict[str, list[tuple[GazetteerPlace, str]]]:
    """Group (place, name) pairs by the stem prefix of the name.

    Every name returned by ``place.all_names()`` contributes one pair, filed
    under the key ``_stem_prefix(name, prefix_len)``. Pairs keep the order in
    which places and names were visited.
    """
    buckets: dict[str, list[tuple[GazetteerPlace, str]]] = {}
    for place in places:
        for name in place.all_names():
            key = _stem_prefix(name, prefix_len)
            if key not in buckets:
                buckets[key] = []
            buckets[key].append((place, name))
    return buckets


def propose_links(
    findspots: list[str],
    places: list[GazetteerPlace],
    *,
    threshold: float = 0.90,
    top_k: int = 3,
    prefix_len: int = 3,
) -> list[FindspotProposal]:
    """
    Propose Pleiades links for each findspot.

    For every findspot, scores it against gazetteer names, keeps the best score
    per place, and returns up to ``top_k`` candidates at or above ``threshold``,
    sorted by score descending. Findspots with no candidate at or above the
    threshold come back with an empty candidate list so the caller can tell
    "reviewed, no match" from "not yet attempted".

    Scoring is restricted to names sharing the findspot's first ``prefix_len``
    stemmed characters — a real gazetteer holds ~10k+ places and a full O(n·m)
    comparison is infeasible (it does not finish on the live corpus; the indexed
    path runs in ~2s). True toponym matches are expected to share their leading
    stem, so recall loss should be negligible. Set ``prefix_len=0`` to force a
    full comparison.

    The default ``threshold`` of 0.90 was tuned against the live corpus: at 0.90
    the proposal queue is mostly correct, while 0.84 admits systematic false
    positives (e.g. "Clusino GA." → the *lake* Clusinus rather than the city).

    Args:
        findspots: Raw findspot strings; one proposal is returned per entry,
            in the same order.
        places: Gazetteer places whose names are matched against.
        threshold: Minimum ``score_match`` value for a candidate to be kept.
        top_k: Maximum number of candidates returned per findspot.
        prefix_len: Length of the stem prefix used for bucketing; ``0`` (or
            less) disables the index and compares against every name.

    Returns:
        One ``FindspotProposal`` per input findspot.
    """
    # Exactly one of the two lookup structures is populated, depending on
    # whether prefix bucketing is enabled.
    index: dict[str, list[tuple[GazetteerPlace, str]]] | None = None
    all_pairs: list[tuple[GazetteerPlace, str]] = []
    if prefix_len > 0:
        index = _build_name_index(places, prefix_len)
    else:
        all_pairs = [(place, name) for place in places for name in place.all_names()]

    proposals: list[FindspotProposal] = []
    for findspot in findspots:
        if index is not None:
            candidate_pairs = index.get(_stem_prefix(findspot, prefix_len), [])
        else:
            candidate_pairs = all_pairs

        per_place: dict[str, LinkCandidate] = {}
        # Unrounded best score per place. Comparing against the rounded value
        # stored on the candidate would let a slightly worse name replace a
        # better one whenever both round to the same 4-decimal score.
        raw_best: dict[str, float] = {}
        for place, name in candidate_pairs:
            s = score_match(findspot, name)
            if s < threshold:
                continue
            pid = place.pleiades_id
            if pid in raw_best and s <= raw_best[pid]:
                continue
            raw_best[pid] = s
            per_place[pid] = LinkCandidate(
                pleiades_id=pid,
                title=place.title,
                score=round(s, 4),
                matched_name=name,
                uri=PLEIADES_PLACE_URI.format(pid),
            )
        candidates = sorted(per_place.values(), key=lambda c: c.score, reverse=True)[:top_k]
        proposals.append(FindspotProposal(findspot=findspot, candidates=candidates))
    return proposals
17 changes: 17 additions & 0 deletions tests/test_gazetteer.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ def test_strips_locative_scaffolding(self):
def test_punctuation_becomes_separators(self):
    """Parentheses and hyphens must not glue tokens together."""
    normalized = normalize_place_name("Clusii (in-agro)")
    assert normalized == "clusii"

def test_strips_cum_conjunction(self):
    """"Clusium cum agro" must normalize down to the bare place name."""
    # If "cum" survived normalization the phrase would score below threshold.
    normalized = normalize_place_name("Clusium cum agro")
    assert normalized == "clusium"

def test_strips_museum_scaffolding(self):
    """Museum housing words are removed; a museum's proper name is kept."""
    expectations = (
        ("Clusii in museo publico", "clusii"),
        ("Volaterris in museo Guarnacci", "volaterris guarnacci"),
    )
    for raw, expected in expectations:
        assert normalize_place_name(raw) == expected

def test_empty(self):
assert normalize_place_name("") == ""
assert normalize_place_name(" ") == ""
Expand Down Expand Up @@ -137,6 +145,15 @@ def test_batch_preserves_order_and_length(self):
proposals = propose_links(findspots, GAZETTEER)
assert [p.findspot for p in proposals] == findspots

def test_cum_phrase_matches_place(self):
    """A "<place> cum agro" findspot resolves to Pleiades place 413047."""
    (proposal,) = propose_links(["Clusium cum agro"], GAZETTEER)
    top = proposal.best
    assert top is not None and top.pleiades_id == "413047"

def test_prefix_index_and_full_scan_agree_on_real_match(self):
    """The prefix-indexed path and the full scan pick the same top place."""
    findspots = ["Clusii in agro"]
    indexed_best = propose_links(findspots, GAZETTEER, prefix_len=3)[0].best
    full_best = propose_links(findspots, GAZETTEER, prefix_len=0)[0].best
    # Guard first so a missing match fails as an assertion, not an
    # AttributeError on None.
    assert indexed_best is not None, "indexed path produced no candidate"
    assert full_best is not None, "full scan produced no candidate"
    assert indexed_best.pleiades_id == full_best.pleiades_id == "413047"


@pytest.mark.parametrize(
("findspot", "expected_id"),
Expand Down
Loading