From 1c1a5fa05d6b51756fe896818c60188f9f06642e Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 20 Oct 2022 16:33:09 +0200 Subject: [PATCH 01/38] WIP: speedup for entity embedding with .pipe(); improvements to Wiki article extraction filters; switch to new DB structure for better support of full-text search with according changes in queries. --- benchmarks/nel/configs/nel.cfg | 11 +-- benchmarks/nel/project.yml | 2 +- .../candidate_generation/embeddings.py | 3 +- benchmarks/nel/scripts/datasets/dataset.py | 46 ++++++++++-- benchmarks/nel/scripts/datasets/mewsli_9.py | 4 +- benchmarks/nel/scripts/datasets/utils.py | 6 +- benchmarks/nel/scripts/parse_wiki_dumps.py | 8 +- benchmarks/nel/scripts/wiki/ddl.sql | 24 ++++-- benchmarks/nel/scripts/wiki/wiki_dump_api.py | 55 +++++++------- benchmarks/nel/scripts/wiki/wikidata.py | 19 ++++- benchmarks/nel/scripts/wiki/wikipedia.py | 73 +++++++++++++++---- 11 files changed, 178 insertions(+), 73 deletions(-) diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index e4aa86cd7..31fa5d26c 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -39,12 +39,13 @@ entity_vector_length = 64 incl_context = true incl_prior = true labels_discard = [] +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} -[components.entity_linker.get_candidates] -@misc = "FuzzyStringGetCandidates.v1" -dataset_name = "${paths.dataset_name}" -max_n_candidates = 50 -similarity_cutoff = 0.5 +;[components.entity_linker.get_candidates] +;@misc = "FuzzyStringGetCandidates.v1" +;dataset_name = "${paths.dataset_name}" +;max_n_candidates = 50 +;similarity_cutoff = 0.5 [components.entity_linker.model] @architectures = "spacy.EntityLinker.v1" diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 6d1f20260..dd653f8e7 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -4,7 +4,7 @@ vars: run: "cg-default" config: "nel.cfg" vectors_model: "en_core_web_lg" - version: "0.0.4" + version: "0.0.5" dataset: "mewsli_9" gpu_id: "" use_filtered_dumps: "--use_filtered_dumps" diff --git a/benchmarks/nel/scripts/candidate_generation/embeddings.py b/benchmarks/nel/scripts/candidate_generation/embeddings.py index 95e97412f..9514c16ae 100644 --- a/benchmarks/nel/scripts/candidate_generation/embeddings.py +++ b/benchmarks/nel/scripts/candidate_generation/embeddings.py @@ -31,7 +31,8 @@ def _fetch_candidates( max_n_candidates: int, lexical_similarity_cutoff: float = 0.5, ) -> Iterable[int]: - target_vec = self._pipeline(span.text, disable=["parser", "senter", "ner"]).vector + target_vec = self._pipeline("Olympia Snowe", disable=["parser", "senter", "ner"]).vector + # target_vec = span.vector if not isinstance(target_vec, numpy.ndarray): target_vec = target_vec.get() diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/scripts/datasets/dataset.py index 2520e14ac..89c9ff61a 100644 --- a/benchmarks/nel/scripts/datasets/dataset.py +++ b/benchmarks/nel/scripts/datasets/dataset.py @@ -109,12 +109,28 @@ def create_knowledge_base(self, model_name: str, **kwargs) -> None: entity_list: List[str] = [] count_list: List[int] = [] vector_list: List[numpy.ndarray] = [] # type: ignore - for qid, info in self._entities.items(): + qids = list(self._entities.keys()) + desc_vectors = [ + doc.vector for doc in + tqdm.tqdm( + self._nlp_base.pipe( + texts=[ + self._entities[qid].description if self._entities[qid].description + else ( + self._entities[qid].article_text[:500] if 
self._entities[qid].article_text + else self._entities[qid].name + ) + for qid in qids + ], + n_process=-1 + ), + total=len(self._entities), + desc="Inferring entity embeddings" + ) + ] + for qid, desc_vector in zip(qids, desc_vectors): entity_list.append(qid) - count_list.append(info.count) - desc_vector = self._nlp_base( - info.description if info.description else info.name - ).vector + count_list.append(self._entities[qid].count) vector_list.append( desc_vector if isinstance(desc_vector, numpy.ndarray) @@ -280,12 +296,22 @@ def evaluate(self, run_name: str) -> None: if ent.text.lower().startswith("the ") else ent for ent in doc.ents ] + tmp_docs_path = "/tmp/docs.spacy" + if not os.path.exists(tmp_docs_path): + docs = tqdm.tqdm( + self._nlp_best.pipe(texts=[doc.text for doc in docs], n_process=-1, batch_size=500), + desc="Inferring entities for test set", + total=len(docs) + ) + DocBin(docs=docs).to_disk(tmp_docs_path) + else: + docs = list(DocBin().from_disk(tmp_docs_path).get_docs(self._nlp_best.vocab)) test_set = [ Example(predicted_doc, doc) for predicted_doc, doc in zip( [ doc for doc in tqdm.tqdm( - self._nlp_best.pipe(texts=[doc.text for doc in docs], n_process=-1, batch_size=500), + docs, # self._nlp_best.pipe(texts=[doc.text for doc in docs], n_process=-1, batch_size=500), desc="Inferring entities for test set", total=len(docs) ) @@ -326,8 +352,16 @@ def evaluate(self, run_name: str) -> None: ent_offset = (ent.start_char, ent.end_char) # For the candidate generation evaluation also mis-aligned entities are considered. label = ent_pred_labels.get(ent_offset, "NIL") + a_cands = ent_cands.get(ent_offset, {}) + a_cands_aliases = {v.alias_ for c, v in ent_cands.get(ent_offset, {}).items()} + a_cands_kb_ids = {v.entity_ for c, v in ent_cands.get(ent_offset, {}).items()} + a_kb_id = ent.kb_id_ + a_ent = ent + a_mention = example.reference.text[ent.start_char:ent.end_char] cand_gen_label_counts[label] += 1 candidate_results.update_metrics(label, ent.kb_id_, set(ent_cands.get(ent_offset, {}).keys())) + if len(a_cands_kb_ids) > 0 and ent.kb_id_ not in a_cands_kb_ids: + x = 3 # Update entity disambiguation stats for baselines. evaluation.add_disambiguation_baseline( diff --git a/benchmarks/nel/scripts/datasets/mewsli_9.py b/benchmarks/nel/scripts/datasets/mewsli_9.py index 249ee10b9..d01b00702 100644 --- a/benchmarks/nel/scripts/datasets/mewsli_9.py +++ b/benchmarks/nel/scripts/datasets/mewsli_9.py @@ -42,9 +42,7 @@ def _parse_external_corpus( ) ) - entities, failed_entity_lookups, _ = fetch_entity_information( - "id", tuple(entity_qids) - ) + entities, failed_entity_lookups, _ = fetch_entity_information(tuple(entity_qids)) return entities, failed_entity_lookups, annotations diff --git a/benchmarks/nel/scripts/datasets/utils.py b/benchmarks/nel/scripts/datasets/utils.py index 1e28c5402..743a6bf3e 100644 --- a/benchmarks/nel/scripts/datasets/utils.py +++ b/benchmarks/nel/scripts/datasets/utils.py @@ -24,13 +24,11 @@ def _does_token_overlap_with_annotation( def fetch_entity_information( - key: str, values: Tuple[str, ...], batch_size: int = 1000, ) -> Tuple[Dict[str, Entity], Set[str], Dict[str, str]]: """ Fetches information on entities from database. - key (str): Attribute to match values to. Must be one of ("id", "name"). values (Tuple[str]): Values for key to look up. db_conn (sqlite3.Connection): Database connection. batch_size (int): Number of entity titles to resolve in the same API request. Between 1 and 50. 
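For readers following the change to `fetch_entity_information`, here is a minimal, self-contained sketch of the batched QID lookup that the function performs after this patch (the next hunk shows the actual loop). `fetch_in_batches`, `loader` and `fake_load` are illustrative names only; in the project, `wiki_dump_api.load_entities` plays the role of `loader`, and the returned objects only need a `.qid` attribute for this sketch.

# Sketch of the batched QID lookup pattern used by fetch_entity_information() after this change.
from types import SimpleNamespace
from typing import Callable, Dict, Set, Tuple


def fetch_in_batches(
    qids: Tuple[str, ...],
    loader: Callable[[Tuple[str, ...]], Dict[str, SimpleNamespace]],
    batch_size: int = 1000,
) -> Tuple[Dict[str, SimpleNamespace], Set[str]]:
    """Resolves entities in fixed-size batches and keeps track of failed lookups."""
    entities: Dict[str, SimpleNamespace] = {}
    failed_lookups: Set[str] = set()

    for i in range(0, len(qids), batch_size):
        # The underscore replacement is carried over from the earlier title-based lookup;
        # it is a no-op for QIDs but kept to mirror the original loop.
        chunk = tuple(v.replace("_", " ") for v in qids[i : i + batch_size])
        entities_chunk = loader(chunk)
        # Every QID in the chunk that did not come back from the DB counts as failed.
        failed = set(chunk)
        for entity in entities_chunk.values():
            entities[entity.qid] = entity
            failed.discard(entity.qid)
        failed_lookups |= failed

    return entities, failed_lookups


if __name__ == "__main__":
    def fake_load(chunk: Tuple[str, ...]) -> Dict[str, SimpleNamespace]:
        # Pretend only Q1 exists in the database.
        return {"Q1": SimpleNamespace(qid="Q1")} if "Q1" in chunk else {}

    ents, failed = fetch_in_batches(("Q1", "Q2", "Q3"), fake_load, batch_size=2)
    print(sorted(ents), sorted(failed))  # ['Q1'] ['Q2', 'Q3']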
@@ -47,14 +45,14 @@ def fetch_entity_information( for i in range(0, len(values), batch_size): chunk = tuple([v.replace("_", " ") for v in values[i : i + batch_size]]) - entities_chunk = wiki_dump_api.load_entities(key, chunk) + entities_chunk = wiki_dump_api.load_entities(chunk) _failed_lookups = set(chunk) # Replace entity titles keys in dict with Wikidata QIDs. Add entity description. for entity in entities_chunk.values(): entities[entity.qid] = entity name_qid_map[entity.name] = entity.qid - _failed_lookups.remove(entity.qid if key == "id" else entity.name) + _failed_lookups.remove(entity.qid) failed_lookups |= _failed_lookups pbar.update(len(chunk)) diff --git a/benchmarks/nel/scripts/parse_wiki_dumps.py b/benchmarks/nel/scripts/parse_wiki_dumps.py index 7f6ea15da..8108400bf 100644 --- a/benchmarks/nel/scripts/parse_wiki_dumps.py +++ b/benchmarks/nel/scripts/parse_wiki_dumps.py @@ -26,4 +26,10 @@ def main( if __name__ == "__main__": - typer.run(main) + # typer.run(main) + wiki_dump_api.parse( + entity_config={"limit": None}, + article_text_config={"limit": None}, + alias_prior_prob_config={"limit": None}, + use_filtered_dumps=False + ) diff --git a/benchmarks/nel/scripts/wiki/ddl.sql b/benchmarks/nel/scripts/wiki/ddl.sql index 7d2a2f54c..0be716164 100644 --- a/benchmarks/nel/scripts/wiki/ddl.sql +++ b/benchmarks/nel/scripts/wiki/ddl.sql @@ -2,25 +2,35 @@ CREATE TABLE entities ( id TEXT PRIMARY KEY NOT NULL, - name TEXT NOT NULL, - description TEXT, - label TEXT, -- This could be normalized. Not worth it at the moment though, since they aren't used. claims TEXT ); -CREATE UNIQUE INDEX idx_entities_name -ON entities (name); + +-- The FTS5 virtual table implementation doesn't allow for indices, so we rely on ROWID to match entities. +-- This isn't great, but in a controlled setup this allows for stable matching. +-- Same for foreign keys. +CREATE VIRTUAL TABLE entities_texts USING fts5( + entity_id UNINDEXED, + name, + description, + label +); CREATE TABLE articles ( entity_id TEXT PRIMARY KEY NOT NULL, id TEXT NOT NULL, - title TEXT NOT NULL, - text TEXT NOT NULL, FOREIGN KEY(entity_id) REFERENCES entities(id) ); CREATE UNIQUE INDEX idx_articles_id ON articles (id); +-- Same here: no indices possible, relying on ROWID to match with articles. +CREATE VIRTUAL TABLE articles_texts USING fts5( + entity_id UNINDEXED, + title, + content +); + CREATE TABLE properties_in_entities ( property_id TEXT NOT NULL, from_entity_id TEXT NOT NULL, diff --git a/benchmarks/nel/scripts/wiki/wiki_dump_api.py b/benchmarks/nel/scripts/wiki/wiki_dump_api.py index 741c758ff..077f5f0b1 100644 --- a/benchmarks/nel/scripts/wiki/wiki_dump_api.py +++ b/benchmarks/nel/scripts/wiki/wiki_dump_api.py @@ -61,7 +61,7 @@ def parse( """ msg = "Database exists already. Execute `spacy project run delete_wiki_db` to remove it." 
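The new ddl.sql above replaces the plain `name`/`description`/`text` columns with FTS5 virtual tables (`entities_texts`, `articles_texts`) that are matched back to `entities`/`articles` via ROWID, since FTS5 tables support neither regular indices nor foreign keys. A minimal sketch of a full-text lookup against that schema: table and column names follow ddl.sql, while the database path and search term are placeholders, and the standard-library `sqlite3` is used here even though the project imports `pysqlite3` (FTS5 availability depends on the SQLite build).

# Full-text search over entity names/descriptions, joining back to `entities` by ROWID.
# Assumes `entities` and `entities_texts` rows were inserted in lockstep so ROWIDs align,
# which is the contract the loaders in this patch rely on.
import sqlite3

db_conn = sqlite3.connect("wiki.sqlite3")  # placeholder path to the parsed Wiki DB
db_conn.row_factory = sqlite3.Row

rows = db_conn.execute(
    """
    SELECT e.id, entities_texts.name, entities_texts.description
    FROM entities_texts
    JOIN entities e ON e.ROWID = entities_texts.ROWID
    WHERE entities_texts MATCH ?
    ORDER BY rank
    LIMIT 10
    """,
    ("new york",),  # placeholder search term
).fetchall()

for row in rows:
    print(row["id"], row["name"], row["description"])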
- assert not os.path.exists(_paths["db"]), msg + # assert not os.path.exists(_paths["db"]), msg db_conn = db_conn if db_conn else establish_db_connection() with open(Path(os.path.abspath(__file__)).parent / "ddl.sql", "r") as ddl_sql: @@ -73,31 +73,28 @@ def parse( **(entity_config if entity_config else {}), ) - wikipedia.read_prior_probs( - _paths["wikipedia_dump"] if not use_filtered_dumps else _paths["filtered_wikipedia_dump"], - db_conn, - **(alias_prior_prob_config if alias_prior_prob_config else {}), - ) - - wikipedia.read_texts( - _paths["wikipedia_dump"] if not use_filtered_dumps else _paths["filtered_wikipedia_dump"], - db_conn, - **(article_text_config if article_text_config else {}), - ) + # wikipedia.read_prior_probs( + # _paths["wikipedia_dump"] if not use_filtered_dumps else _paths["filtered_wikipedia_dump"], + # db_conn, + # **(alias_prior_prob_config if alias_prior_prob_config else {}), + # ) + # + # wikipedia.read_texts( + # _paths["wikipedia_dump"] if not use_filtered_dumps else _paths["filtered_wikipedia_dump"], + # db_conn, + # **(article_text_config if article_text_config else {}), + # ) def load_entities( - key: str, values: Tuple[str, ...], db_conn: Optional[sqlite3.Connection] = None + values: Tuple[str, ...], db_conn: Optional[sqlite3.Connection] = None ) -> Dict[str, Entity]: """Loads information for entity or entities by querying information from DB. Note that this doesn't return all available information, only the part used in the current benchmark solution. - key (str): Attribute to match values to. Must be one of ("id", "name"). values (Tuple[str]): Values for key to look up. db_conn (Optional[sqlite3.Connection]): Database connection. RETURNS (Dict[str, Entity]): Information on requested entities. """ - - assert key in ("id", "name") db_conn = db_conn if db_conn else establish_db_connection() return { @@ -123,28 +120,32 @@ def load_entities( f""" SELECT e.id, - e.name as entity_title, - e.description, - e.label, - a.title as article_title, - a.text, + et.name as entity_title, + et.description, + et.label, + at.title as article_title, + at.content, GROUP_CONCAT(afe.alias) as aliases, SUM(afe.count) as count FROM entities e + LEFT JOIN entities_texts et on + et.ROWID = e.ROWID LEFT JOIN articles a on a.entity_id = e.id + LEFT JOIN articles_texts at on + at.ROWID = a.ROWID LEFT JOIN aliases_for_entities afe on afe.entity_id = e.id WHERE - e.{key} IN (%s) + e.id IN (%s) GROUP BY e.id, - e.name, - e.description, - e.label, - a.title, - a.text + et.name, + et.description, + et.label, + at.title, + at.content """ % ",".join("?" 
* len(values)), tuple(set(values)), diff --git a/benchmarks/nel/scripts/wiki/wikidata.py b/benchmarks/nel/scripts/wiki/wikidata.py index 455b69665..9a40e57c2 100644 --- a/benchmarks/nel/scripts/wiki/wikidata.py +++ b/benchmarks/nel/scripts/wiki/wikidata.py @@ -86,6 +86,9 @@ def read_entities( "P279": exclude_list, # subclass } + entity_ids_in_db: Set[str] = { + rec["id"] for rec in db_conn.cursor().execute("SELECT id FROM entities") + } title_to_id: Dict[str, str] = {} id_to_attrs: Dict[str, Dict[str, Any]] = {} @@ -96,6 +99,7 @@ def read_entities( desc="Parsing entity data", leave=True, miniters=1000, **pbar_params ) as pbar: for cnt, line in enumerate(file): + pbar.update(1) if limit and cnt >= limit: break @@ -105,6 +109,8 @@ def read_entities( if len(clean_line) > 1: obj = json.loads(clean_line) + if obj.get("id") in entity_ids_in_db: + continue entry_type = obj["type"] if entry_type == "item": @@ -218,17 +224,18 @@ def _write_to_db( """ entities: List[Tuple[Optional[str], ...]] = [] + entities_texts: List[Tuple[Optional[str], ...]] = [] props_in_ents: Set[Tuple[str, str, str]] = set() aliases_for_entities: List[Tuple[str, str, int]] = [] for title, qid in title_to_id.items(): - entities.append( + entities.append((qid, json.dumps(id_to_attrs[qid]["claims"]))) + entities_texts.append( ( qid, title, id_to_attrs[qid].get("description", None), id_to_attrs[qid].get("labels", {}).get("value", None), - json.dumps(id_to_attrs[qid]["claims"]), ) ) for alias in id_to_attrs[qid]["aliases"]: @@ -240,11 +247,15 @@ def _write_to_db( cur = db_conn.cursor() cur.executemany( - "INSERT OR IGNORE INTO entities (id, name, description, label, claims) VALUES (?, ?, ?, ?, ?)", + "INSERT INTO entities (id, claims) VALUES (?, ?)", entities, ) cur.executemany( - "INSERT OR IGNORE INTO properties_in_entities (property_id, from_entity_id, to_entity_id) VALUES (?, ?, ?)", + "INSERT INTO entities_texts (entity_id, name, description, label) VALUES (?, ?, ?, ?)", + entities_texts, + ) + cur.executemany( + "INSERT INTO properties_in_entities (property_id, from_entity_id, to_entity_id) VALUES (?, ?, ?)", props_in_ents, ) cur.executemany( diff --git a/benchmarks/nel/scripts/wiki/wikipedia.py b/benchmarks/nel/scripts/wiki/wikipedia.py index 3f23d371d..a32f6a5cb 100644 --- a/benchmarks/nel/scripts/wiki/wikipedia.py +++ b/benchmarks/nel/scripts/wiki/wikipedia.py @@ -255,30 +255,44 @@ def read_texts( row["name"]: row["id"] for row in db_conn.execute("SELECT name, id FROM entities") } - records: List[Tuple[str, str, str, str]] = [] + # records: List[Tuple[str, str, str, str]] = [] + article_records: List[Tuple[str, str]] = [] + article_texts_records: List[Tuple[str, str, str]] = [] # Fetch IDs of entities whose articles are already in the DB. article_ids_in_db: Set[str] = { rec["id"] for rec in db_conn.cursor().execute("SELECT id FROM articles") } - def write_to_db(_records: List[Tuple[str, str, str, str]]) -> None: + def write_to_db(_article_records: List[Tuple[str, str]], _article_text_records: List[Tuple[str, str, str]]) -> None: """Writes records to list. - _records (List[Tuple[str, str, str]]): Article triples with entity ID, title and text. + _article_records (List[Tuple[str, str]]): `articles`entries with entity ID, ID. + _article_texts_records (List[Tuple[str, str, str]]): `articles_texts` entries with entity ID, title, content. 
""" db_conn.cursor().executemany( - "INSERT OR IGNORE INTO articles (entity_id, id, title, text) VALUES (?, ?, ?, ?)", - records, + "INSERT OR IGNORE INTO articles (entity_id, id) VALUES (?, ?)", + _article_records, + ) + db_conn.cursor().executemany( + "INSERT OR IGNORE INTO articles_texts (entity_id, title, content) VALUES (?, ?, ?)", + _article_text_records ) db_conn.commit() with bz2.open(wikipedia_input_path, mode="rb") as file: pbar_params = {"total": limit} if limit else {} with tqdm.tqdm(desc="Parsing article texts", miniters=1000, **pbar_params) as pbar: + n_articles = 0 + n_viable_articles = 0 article_text = "" article_title: Optional[str] = None article_id: Optional[str] = None reading_text = False reading_revision = False + # Terms in article indicating it should be skipped (for redirects and disambiguation pages). + # Note: checks for redirection/disambiguation articles are not language-agnostic. Porting this to the + # generalized extraction needs to consider that. + skip_terms = ("#redirect", "#redirection", "{{disambiguation}}") + skip_article = False for line in file: if limit and pbar.n >= limit: @@ -286,6 +300,16 @@ def write_to_db(_records: List[Tuple[str, str, str, str]]) -> None: clean_line = line.strip().decode("utf-8") + # Check if article is to be skipped. + cl_lower = clean_line.lower() + for skip_term in skip_terms: + if skip_term in cl_lower: + skip_article = True + break + # Skip to next line if article is to be skipped. + if skip_article and clean_line != "": + continue + if clean_line == "": reading_revision = True elif clean_line == "": @@ -293,6 +317,7 @@ def write_to_db(_records: List[Tuple[str, str, str, str]]) -> None: # Start reading new page if clean_line == "": + n_articles += 1 article_text = "" article_title = None article_id = None @@ -303,29 +328,42 @@ def write_to_db(_records: List[Tuple[str, str, str, str]]) -> None: clean_text, entities = _process_wp_text( article_title, article_text, entity_title_to_id ) - if clean_text is not None and entities is not None: + if clean_text is not None: + n_viable_articles += 1 if article_title in entity_title_to_id: - records.append( + text_to_append = clean_text[:n_char_limit] + for (to_replace, replacement) in ( + ("(;", " "), + ("(,", " "), + (" ; ", " "), + (" , ", ""), + ("()", ""), + ): + text_to_append = text_to_append.replace( + to_replace, replacement + ) + + article_records.append((entity_title_to_id[article_title], article_id)) + article_texts_records.append( ( entity_title_to_id[article_title], - article_id, article_title, - " ".join( - clean_text[:n_char_limit].split(" ")[:-1] - ), + " ".join(text_to_append.split(" ")[:-1]), ) ) pbar.update(1) if pbar.n % batch_size == 0: - write_to_db(records) - records = [] + write_to_db(article_records, article_texts_records) + article_records = [] + article_texts_records = [] article_text = "" article_title = None article_id = None reading_text = False reading_revision = False + skip_article = False # start reading text within a page if " None: article_title = titles[0].strip() if pbar.n % batch_size != 0: - write_to_db(records) + write_to_db(article_records, article_texts_records) + + print( + f"Processed {n_articles} articles.\n " + f"Of which viable (with article ID and text): {n_viable_articles} ({n_viable_articles / n_articles * 100:.2f}%)" + f"\n " + f"Of which processed (title in entity table): {pbar.n} ({pbar.n / n_viable_articles * 100:.2f}%)" + ) def extract_demo_dump( From 6fd27f73b24f4803d243caf8115e0f44097649d8 Mon Sep 17 00:00:00 2001 From: 
Raphael Mitsch Date: Thu, 20 Oct 2022 16:34:06 +0200 Subject: [PATCH 02/38] Remove redundant test code. --- benchmarks/nel/scripts/candidate_generation/embeddings.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/nel/scripts/candidate_generation/embeddings.py b/benchmarks/nel/scripts/candidate_generation/embeddings.py index 9514c16ae..bd4f49987 100644 --- a/benchmarks/nel/scripts/candidate_generation/embeddings.py +++ b/benchmarks/nel/scripts/candidate_generation/embeddings.py @@ -31,8 +31,7 @@ def _fetch_candidates( max_n_candidates: int, lexical_similarity_cutoff: float = 0.5, ) -> Iterable[int]: - target_vec = self._pipeline("Olympia Snowe", disable=["parser", "senter", "ner"]).vector - # target_vec = span.vector + target_vec = span.vector if not isinstance(target_vec, numpy.ndarray): target_vec = target_vec.get() From 785fb0d05d7d421666457ee15543695bbb8c31e4 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 24 Oct 2022 10:45:13 +0200 Subject: [PATCH 03/38] Adjust to new DB structure. --- benchmarks/nel/.gitignore | 3 +- benchmarks/nel/configs/filter_terms.txt | 2 - benchmarks/nel/project.yml | 44 +- .../scripts/candidate_generation/lexical.py | 9 + benchmarks/nel/scripts/compile_corpora.py | 2 +- benchmarks/nel/scripts/evaluate.py | 3 +- .../nel/scripts/extract_demo_wiki_dump.py | 6 - benchmarks/nel/scripts/parse_wiki_dumps.py | 35 - benchmarks/nel/scripts/wiki/__init__.py | 0 benchmarks/nel/scripts/wiki/ddl.sql | 55 -- benchmarks/nel/scripts/wiki/download.sh | 9 - benchmarks/nel/scripts/wiki/namespaces.py | 129 ---- benchmarks/nel/scripts/wiki/wiki_dump_api.py | 199 ------ benchmarks/nel/scripts/wiki/wikidata.py | 313 --------- benchmarks/nel/scripts/wiki/wikipedia.py | 656 ------------------ .../tagger_parser_ud/configs/default.cfg | 4 +- 16 files changed, 36 insertions(+), 1433 deletions(-) delete mode 100644 benchmarks/nel/configs/filter_terms.txt delete mode 100644 benchmarks/nel/scripts/extract_demo_wiki_dump.py delete mode 100644 benchmarks/nel/scripts/parse_wiki_dumps.py delete mode 100644 benchmarks/nel/scripts/wiki/__init__.py delete mode 100644 benchmarks/nel/scripts/wiki/ddl.sql delete mode 100644 benchmarks/nel/scripts/wiki/download.sh delete mode 100644 benchmarks/nel/scripts/wiki/namespaces.py delete mode 100644 benchmarks/nel/scripts/wiki/wiki_dump_api.py delete mode 100644 benchmarks/nel/scripts/wiki/wikidata.py delete mode 100644 benchmarks/nel/scripts/wiki/wikipedia.py diff --git a/benchmarks/nel/.gitignore b/benchmarks/nel/.gitignore index 3ed09abe7..086665922 100644 --- a/benchmarks/nel/.gitignore +++ b/benchmarks/nel/.gitignore @@ -1 +1,2 @@ -assets/ \ No newline at end of file +assets/ +wikid/ \ No newline at end of file diff --git a/benchmarks/nel/configs/filter_terms.txt b/benchmarks/nel/configs/filter_terms.txt deleted file mode 100644 index 961659726..000000000 --- a/benchmarks/nel/configs/filter_terms.txt +++ /dev/null @@ -1,2 +0,0 @@ -New York -Boston \ No newline at end of file diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index dd653f8e7..24551fd16 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -4,34 +4,16 @@ vars: run: "cg-default" config: "nel.cfg" vectors_model: "en_core_web_lg" - version: "0.0.5" + version: "0.0.6" dataset: "mewsli_9" gpu_id: "" - use_filtered_dumps: "--use_filtered_dumps" - use_filter_terms: "--use_filter_terms" + download_all_wiki_assets: "" # "--extra" to download full Wiki dumps. 
+ filter: "--filter" # Whether to only use parts of Wiki data and corpus containing filter terms. training_max_steps: 1000 eval_highlight_metric: "F" # one of ("F", "r", "p") directories: ["assets", "training", "configs", "scripts", "corpora", "temp", "evaluation"] -assets: - - dest: 'assets/wiki/wikidata_entity_dump.json.bz2' - url: 'https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2' - description: Wikidata entity dump. Download can take a long time! - extra: True - - dest: 'assets/wiki/wikipedia_dump.xml.bz2' - url: 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2' - description: Wikipedia dump. Download can take a long time! - extra: True - - dest: 'assets/wiki/wikidata_entity_dump_filtered.json.bz2' - url: 'https://github.com/explosion/projects/releases/download/nel-benchmark-filtered-wiki-data/wikidata_entity_dump_filtered.json.bz2' - description: Filtered Wikidata entity dump for demo purposes. - checksum: 'ba2d979105abf174208608b942242fcb' - - dest: 'assets/wiki/wikipedia_dump_filtered.xml.bz2' - url: 'https://github.com/explosion/projects/releases/download/nel-benchmark-filtered-wiki-data/wikipedia_dump_filtered.xml.bz2' - description: Filtered Wikipedia dump for demo purposes. - checksum: 'cb624eaa5887fe1ff47a9206c9bdcfd8' - workflows: all: - download_mewsli9 @@ -69,10 +51,24 @@ commands: script: - "python -m spacy download ${vars.vectors_model}" - - name: parse_wiki_dumps + - name: wiki_clone_wikid + help: Clone `wikid` to prepare Wiki database and `KnowledgeBase`. + script: + # To be updated after merge. + - git clone https://github.com/rmitsch/wikid.git && cd wikid && git checkout feature/wiki-parsing && cd .. + - pip install -r wikid/requirements.txt + + - name: wiki_download_assets + help: "Download Wikipedia dumps. This can take a long time if you're not using the filtered dumps!" + script: + - "spacy project assets wikid ${vars.download_all_wiki_assets}" + outputs: + - "wikid/assets/" + + - name: wiki_parse help: "Parse Wikipedia dumps. This can take a long time if you're not using the filtered dumps!" script: - - "env PYTHONPATH=scripts python ./scripts/parse_wiki_dumps.py ${vars.use_filtered_dumps}" + - "spacy project run parse wikid --vars.use_filtered_dumps ${vars.filter}" outputs: - "assets/wiki/wiki.sqlite3" @@ -94,7 +90,7 @@ commands: - name: compile_corpora help: "Compile corpora, separated in train/dev/test sets." script: - - "python ./scripts/compile_corpora.py ${vars.dataset} ${vars.use_filter_terms}" + - "python ./scripts/compile_corpora.py ${vars.dataset} ${vars.filter}" deps: - "assets/${vars.dataset}/entities.pkl" - "assets/${vars.dataset}/entities_failed_lookups.pkl" diff --git a/benchmarks/nel/scripts/candidate_generation/lexical.py b/benchmarks/nel/scripts/candidate_generation/lexical.py index be3100b3c..6c8ae8c64 100644 --- a/benchmarks/nel/scripts/candidate_generation/lexical.py +++ b/benchmarks/nel/scripts/candidate_generation/lexical.py @@ -21,8 +21,17 @@ def _fetch_candidates( max_n_candidates: int, similarity_cutoff: float = 0.5, ) -> Iterable[int]: + # todo 1. replace all wiki parsing stuff with wikid + # todo 2. review generation of KB, move to wikid + # todo 3. get rid of entity pickle files (move loading, stats to compile_corpora) + # todo 4. re-evaluate efficacy of fuzzy string lookup (memory? access time?) + # todo also: push forward spacy NEL changes - add mechanism for pushing back entity sets instead of single + # entities - how? 
+ + hits = self._lookup_struct.get(span.text, []) all_cands = [ kb.get_alias_candidates(entry[1]) for entry in self._lookup_struct.get(span.text, []) if entry[0] >= similarity_cutoff ][:max_n_candidates] + x = 3 return {cand for cands_for_alias in all_cands for cand in cands_for_alias} diff --git a/benchmarks/nel/scripts/compile_corpora.py b/benchmarks/nel/scripts/compile_corpora.py index 28e154f99..bc477f6e8 100644 --- a/benchmarks/nel/scripts/compile_corpora.py +++ b/benchmarks/nel/scripts/compile_corpora.py @@ -5,7 +5,7 @@ from utils import read_filter_terms -def main(dataset_name: str, use_filter_terms: bool = typer.Option(False, "--use_filter_terms")): +def main(dataset_name: str, use_filter_terms: bool = typer.Option(False, "--filter")): """Create corpora in spaCy format. dataset_name (str): Dataset name. use_filter_terms (bool): Whether to use the filter terms defined in the dataset config. If True, only documents diff --git a/benchmarks/nel/scripts/evaluate.py b/benchmarks/nel/scripts/evaluate.py index e8ee7db68..e56b4c2da 100644 --- a/benchmarks/nel/scripts/evaluate.py +++ b/benchmarks/nel/scripts/evaluate.py @@ -13,4 +13,5 @@ def main(dataset_name: str, run_name: str): if __name__ == "__main__": - typer.run(main) + main("mewsli_9", "cg-embedding") + # typer.run(main) diff --git a/benchmarks/nel/scripts/extract_demo_wiki_dump.py b/benchmarks/nel/scripts/extract_demo_wiki_dump.py deleted file mode 100644 index 6ebaec08f..000000000 --- a/benchmarks/nel/scripts/extract_demo_wiki_dump.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Extract demo set from Wiki dumps.""" -from utils import read_filter_terms -from wiki import wiki_dump_api - -if __name__ == '__main__': - wiki_dump_api.extract_demo_dump(read_filter_terms()) diff --git a/benchmarks/nel/scripts/parse_wiki_dumps.py b/benchmarks/nel/scripts/parse_wiki_dumps.py deleted file mode 100644 index 8108400bf..000000000 --- a/benchmarks/nel/scripts/parse_wiki_dumps.py +++ /dev/null @@ -1,35 +0,0 @@ -""" Parsing of Wiki dump and persisting of parsing results to DB. """ -from typing import Optional -import typer -from wiki import wiki_dump_api - - -def main( - entity_limit: Optional[int] = typer.Option(None, "--entity_limit"), - article_limit: Optional[int] = typer.Option(None, "--article_limit"), - alias_limit: Optional[int] = typer.Option(None, "--alias_limit"), - use_filtered_dumps: bool = typer.Option(False, "--use_filtered_dumps"), -): - """Parses Wikidata and Wikipedia dumps. Persists parsing results to DB. - entity_limit (Optional[int]): Max. number of entities to parse. Unlimited if None. - article_limit (Optional[int]): Max. number of entities to parse. Unlimited if None. - alias_limit (Optional[int]): Max. number of entity aliases to parse. Unlimited if None. - use_filtered_dumps (bool): Whether to use filtered Wiki dumps instead of the full ones. 
- """ - - wiki_dump_api.parse( - entity_config={"limit": entity_limit}, - article_text_config={"limit": article_limit}, - alias_prior_prob_config={"limit": alias_limit}, - use_filtered_dumps=use_filtered_dumps - ) - - -if __name__ == "__main__": - # typer.run(main) - wiki_dump_api.parse( - entity_config={"limit": None}, - article_text_config={"limit": None}, - alias_prior_prob_config={"limit": None}, - use_filtered_dumps=False - ) diff --git a/benchmarks/nel/scripts/wiki/__init__.py b/benchmarks/nel/scripts/wiki/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmarks/nel/scripts/wiki/ddl.sql b/benchmarks/nel/scripts/wiki/ddl.sql deleted file mode 100644 index 0be716164..000000000 --- a/benchmarks/nel/scripts/wiki/ddl.sql +++ /dev/null @@ -1,55 +0,0 @@ --- DDL for parsed Wiki data. - -CREATE TABLE entities ( - id TEXT PRIMARY KEY NOT NULL, - -- This could be normalized. Not worth it at the moment though, since they aren't used. - claims TEXT -); - --- The FTS5 virtual table implementation doesn't allow for indices, so we rely on ROWID to match entities. --- This isn't great, but in a controlled setup this allows for stable matching. --- Same for foreign keys. -CREATE VIRTUAL TABLE entities_texts USING fts5( - entity_id UNINDEXED, - name, - description, - label -); - -CREATE TABLE articles ( - entity_id TEXT PRIMARY KEY NOT NULL, - id TEXT NOT NULL, - FOREIGN KEY(entity_id) REFERENCES entities(id) -); -CREATE UNIQUE INDEX idx_articles_id -ON articles (id); - --- Same here: no indices possible, relying on ROWID to match with articles. -CREATE VIRTUAL TABLE articles_texts USING fts5( - entity_id UNINDEXED, - title, - content -); - -CREATE TABLE properties_in_entities ( - property_id TEXT NOT NULL, - from_entity_id TEXT NOT NULL, - to_entity_id TEXT NOT NULL, - PRIMARY KEY (property_id, from_entity_id, to_entity_id), - FOREIGN KEY(from_entity_id) REFERENCES entities(id), - FOREIGN KEY(to_entity_id) REFERENCES entities(id) -); -CREATE INDEX idx_properties_in_entities -ON properties_in_entities (property_id); - -CREATE TABLE aliases_for_entities ( - alias TEXT NOT NULL, - entity_id TEXT NOT NULL, - count INTEGER, - PRIMARY KEY (alias, entity_id), - FOREIGN KEY(entity_id) REFERENCES entities(id) -); -CREATE INDEX idx_aliases_for_entities_alias -ON aliases_for_entities (alias); -CREATE INDEX idx_aliases_for_entities_entity_id -ON aliases_for_entities (entity_id); \ No newline at end of file diff --git a/benchmarks/nel/scripts/wiki/download.sh b/benchmarks/nel/scripts/wiki/download.sh deleted file mode 100644 index a121e0687..000000000 --- a/benchmarks/nel/scripts/wiki/download.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -# Utility for robustly downloading large files (i.e. retrying on dropped connections). -# Source: https://superuser.com/a/689340 - -while [ 1 ]; do -wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 0 --continue $1 -if [ $? = 0 ]; then break; fi; # check return value, break if successful (0) -sleep 1s; -done \ No newline at end of file diff --git a/benchmarks/nel/scripts/wiki/namespaces.py b/benchmarks/nel/scripts/wiki/namespaces.py deleted file mode 100644 index a04900066..000000000 --- a/benchmarks/nel/scripts/wiki/namespaces.py +++ /dev/null @@ -1,129 +0,0 @@ -""" Information on Wiki namespaces. -Source: https://github.com/explosion/projects/blob/master/nel-wikipedia/wiki_namespaces.py. 
-""" - -# List of meta pages in Wikidata, should be kept out of the Knowledge base -WD_META_ITEMS = [ - "Q163875", - "Q191780", - "Q224414", - "Q4167836", - "Q4167410", - "Q4663903", - "Q11266439", - "Q13406463", - "Q15407973", - "Q18616576", - "Q19887878", - "Q22808320", - "Q23894233", - "Q33120876", - "Q42104522", - "Q47460393", - "Q64875536", - "Q66480449", -] - - -# TODO: add more cases from non-English WP's - -# List of prefixes that refer to Wikipedia "file" pages -WP_FILE_NAMESPACE = ["Bestand", "File"] - -# List of prefixes that refer to Wikipedia "category" pages -WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"] - -# List of prefixes that refer to Wikipedia "meta" pages -# these will/should be matched ignoring case -WP_META_NAMESPACE = ( - WP_FILE_NAMESPACE - + WP_CATEGORY_NAMESPACE - + [ - "b", - "betawikiversity", - "Book", - "c", - "Commons", - "d", - "dbdump", - "download", - "Draft", - "Education", - "Foundation", - "Gadget", - "Gadget definition", - "Gebruiker", - "gerrit", - "Help", - "Image", - "Incubator", - "m", - "mail", - "mailarchive", - "media", - "MediaWiki", - "MediaWiki talk", - "Mediawikiwiki", - "MediaZilla", - "Meta", - "Metawikipedia", - "Module", - "mw", - "n", - "nost", - "oldwikisource", - "otrs", - "OTRSwiki", - "Overleg gebruiker", - "outreach", - "outreachwiki", - "Portal", - "phab", - "Phabricator", - "Project", - "q", - "quality", - "rev", - "s", - "spcom", - "Special", - "species", - "Strategy", - "sulutil", - "svn", - "Talk", - "Template", - "Template talk", - "Testwiki", - "ticket", - "TimedText", - "Toollabs", - "tools", - "tswiki", - "User", - "User talk", - "v", - "voy", - "w", - "Wikibooks", - "Wikidata", - "wikiHow", - "Wikinvest", - "wikilivres", - "Wikimedia", - "Wikinews", - "Wikipedia", - "Wikipedia talk", - "Wikiquote", - "Wikisource", - "Wikispecies", - "Wikitech", - "Wikiversity", - "Wikivoyage", - "wikt", - "wiktionary", - "wmf", - "wmania", - "WP", - ] -) diff --git a/benchmarks/nel/scripts/wiki/wiki_dump_api.py b/benchmarks/nel/scripts/wiki/wiki_dump_api.py deleted file mode 100644 index 077f5f0b1..000000000 --- a/benchmarks/nel/scripts/wiki/wiki_dump_api.py +++ /dev/null @@ -1,199 +0,0 @@ -""" Wiki dataset for unified access to information from Wikipedia and Wikidata dumps. """ -import os.path -import pickle -from pathlib import Path -from typing import Dict, Optional, Any, Tuple, List, Set -import pysqlite3 as sqlite3 - -from schemas import Entity -from wiki import wikidata, wikipedia - -_assets_dir = Path(os.path.abspath(__file__)).parent.parent.parent / "assets" / "wiki" -_paths = { - "db": _assets_dir / "wiki.sqlite3", - "wikidata_dump": _assets_dir / "wikidata_entity_dump.json.bz2", - "wikipedia_dump": _assets_dir / "wikipedia_dump.xml.bz2", - "filtered_wikidata_dump": _assets_dir / "wikidata_entity_dump_filtered.json.bz2", - "filtered_wikipedia_dump": _assets_dir / "wikipedia_dump_filtered.xml.bz2", - "filtered_entity_entity_info": _assets_dir / "wiki_entity_info_filtered.pkl" -} - - -def establish_db_connection() -> sqlite3.Connection: - """Estabished database connection. - RETURNS (sqlite3.Connection): Database connection. - """ - db_conn = sqlite3.connect(_paths["db"]) - db_conn.row_factory = sqlite3.Row - return db_conn - - -def extract_demo_dump(filter_terms: Set[str]) -> None: - """Extracts small demo dump by parsing the Wiki dumps and keeping only those entities (and their articles) - containing any of the specified filter_terms. 
The retained entities and articles are written into - filter_terms (Set[str]): Terms having to appear in entity descriptions in order to be wrr - """ - - entity_ids, entity_labels = wikidata.extract_demo_dump( - _paths["wikidata_dump"], _paths["filtered_wikidata_dump"], filter_terms - ) - with open(_paths["filtered_entity_entity_info"], "wb") as file: - pickle.dump((entity_ids, entity_labels), file) - - with open(_paths["filtered_entity_entity_info"], "rb") as file: - _, entity_labels = pickle.load(file) - wikipedia.extract_demo_dump(_paths["wikipedia_dump"], _paths["filtered_wikipedia_dump"], entity_labels) - - -def parse( - db_conn: Optional[sqlite3.Connection] = None, - entity_config: Optional[Dict[str, Any]] = None, - article_text_config: Optional[Dict[str, Any]] = None, - alias_prior_prob_config: Optional[Dict[str, Any]] = None, - use_filtered_dumps: bool = False -) -> None: - """Parses Wikipedia and Wikidata dumps. Writes parsing results to a database. Note that this takes hours. - db_conn (Optional[sqlite3.Connection]): Database connection. - entity_config (Dict[str, Any]): Arguments to be passed on to wikidata.read_entities(). - article_text_config (Dict[str, Any]): Arguments to be passed on to wikipedia.read_text(). - alias_prior_prob_config (Dict[str, Any]): Arguments to be passed on to wikipedia.read_prior_probs(). - use_filtered_dumps (bool): Whether to use small, filtered Wiki dumps. - """ - - msg = "Database exists already. Execute `spacy project run delete_wiki_db` to remove it." - # assert not os.path.exists(_paths["db"]), msg - - db_conn = db_conn if db_conn else establish_db_connection() - with open(Path(os.path.abspath(__file__)).parent / "ddl.sql", "r") as ddl_sql: - db_conn.cursor().executescript(ddl_sql.read()) - - wikidata.read_entities( - _paths["wikidata_dump"] if not use_filtered_dumps else _paths["filtered_wikidata_dump"], - db_conn, - **(entity_config if entity_config else {}), - ) - - # wikipedia.read_prior_probs( - # _paths["wikipedia_dump"] if not use_filtered_dumps else _paths["filtered_wikipedia_dump"], - # db_conn, - # **(alias_prior_prob_config if alias_prior_prob_config else {}), - # ) - # - # wikipedia.read_texts( - # _paths["wikipedia_dump"] if not use_filtered_dumps else _paths["filtered_wikipedia_dump"], - # db_conn, - # **(article_text_config if article_text_config else {}), - # ) - - -def load_entities( - values: Tuple[str, ...], db_conn: Optional[sqlite3.Connection] = None -) -> Dict[str, Entity]: - """Loads information for entity or entities by querying information from DB. - Note that this doesn't return all available information, only the part used in the current benchmark solution. - values (Tuple[str]): Values for key to look up. - db_conn (Optional[sqlite3.Connection]): Database connection. - RETURNS (Dict[str, Entity]): Information on requested entities. 
- """ - db_conn = db_conn if db_conn else establish_db_connection() - - return { - rec["id"]: Entity( - qid=rec["id"], - name=rec["entity_title"], - aliases={ - alias - for alias in { - rec["entity_title"], - rec["article_title"], - rec["label"], - *(rec["aliases"] if rec["aliases"] else "").split(","), - } - if alias - }, - article_title=rec["article_title"], - article_text=rec["text"], - description=rec["description"], - count=rec["count"] if rec["count"] else 0, - ) - for rec in db_conn.cursor().execute( - f""" - SELECT - e.id, - et.name as entity_title, - et.description, - et.label, - at.title as article_title, - at.content, - GROUP_CONCAT(afe.alias) as aliases, - SUM(afe.count) as count - FROM - entities e - LEFT JOIN entities_texts et on - et.ROWID = e.ROWID - LEFT JOIN articles a on - a.entity_id = e.id - LEFT JOIN articles_texts at on - at.ROWID = a.ROWID - LEFT JOIN aliases_for_entities afe on - afe.entity_id = e.id - WHERE - e.id IN (%s) - GROUP BY - e.id, - et.name, - et.description, - et.label, - at.title, - at.content - """ - % ",".join("?" * len(values)), - tuple(set(values)), - ) - } - - -def load_alias_entity_prior_probabilities( - entity_ids: Set[str], db_conn: Optional[sqlite3.Connection] = None -) -> Dict[str, List[Tuple[str, float]]]: - """Loads alias-entity counts from database and transforms them into prior probabilities per alias. - entity_ids (Set[str]): Set of entity IDs to allow. - RETURN (Dict[str, Tuple[Tuple[str, ...], Tuple[float, ...]]]): Mapping of alias to tuples of entities and the - corresponding prior probabilities. - """ - - db_conn = db_conn if db_conn else establish_db_connection() - - alias_entity_prior_probs = { - rec["alias"]: [ - (entity_id, int(count)) - for entity_id, count in zip( - rec["entity_ids"].split(","), rec["counts"].split(",") - ) - ] - for rec in db_conn.cursor().execute( - """ - SELECT - alias, - GROUP_CONCAT(entity_id) as entity_ids, - GROUP_CONCAT(count) as counts - FROM - aliases_for_entities - WHERE - entity_id IN (%s) - GROUP BY - alias - """ - % ",".join("?" * len(entity_ids)), - tuple(entity_ids), - ) - } - - for alias, entity_counts in alias_entity_prior_probs.items(): - total_count = sum([ec[1] for ec in entity_counts]) - alias_entity_prior_probs[alias] = [ - (ec[0], ec[1] / max(total_count, 1)) for ec in entity_counts - ] - - return alias_entity_prior_probs - diff --git a/benchmarks/nel/scripts/wiki/wikidata.py b/benchmarks/nel/scripts/wiki/wikidata.py deleted file mode 100644 index 9a40e57c2..000000000 --- a/benchmarks/nel/scripts/wiki/wikidata.py +++ /dev/null @@ -1,313 +0,0 @@ -""" Functionalities for processing Wikidata dump. -Modified from https://github.com/explosion/projects/blob/master/nel-wikipedia/wikidata_processor.py. -""" - -import bz2 -import io -import json -import pysqlite3 as sqlite3 -from pathlib import Path -from typing import Union, Optional, Dict, Tuple, Any, List, Set, Iterator - -import tqdm - -from wiki.namespaces import WD_META_ITEMS - - -def chunked_readlines( - f: bz2.BZ2File, chunk_size: int = 1024 * 1024 * 32 -) -> Iterator[bytes]: - """Reads lines from compressed BZ2 file in chunks. Source: https://stackoverflow.com/a/65765814. - chunk_size (int): Chunk size in bytes. - RETURNS (Iterator[bytes]): Read bytes. 
- """ - s = io.BytesIO() - while True: - buf = f.read(chunk_size) - if not buf: - return s.getvalue() - s.write(buf) - s.seek(0) - l = s.readlines() - yield from l[:-1] - s = io.BytesIO() - # very important: the last line read in the 1 MB chunk might be - # incomplete, so we keep it to be processed in the next iteration - # check if this is ok if f.read() stopped in the middle of a \r\n? - s.write(l[-1]) - - -def read_entities( - wikidata_file: Union[str, Path], - db_conn: sqlite3.Connection, - batch_size: int = 5000, - limit: Optional[int] = None, - lang: str = "en", - parse_descr: bool = True, - parse_properties: bool = True, - parse_sitelinks: bool = True, - parse_labels: bool = True, - parse_aliases: bool = True, - parse_claims: bool = True, -) -> None: - """Reads entity information from wikidata dump. - wikidata_file (Union[str, Path]): Path of wikidata dump file. - db_conn (sqlite3.Connection): DB connection. - batch_size (int): Batch size for DB commits. - limit (Optional[int]): Max. number of entities to parse. - to_print (bool): Whether to print information during the parsing process. - lang (str): Language with which to filter entity information. - parse_descr (bool): Whether to parse entity descriptions. - parse_properties (bool): Whether to parse entity properties. - parse_sitelinks (bool): Whether to parse entity sitelinks. - parse_labels (bool): Whether to parse entity labels. - parse_aliases (bool): Whether to parse entity aliases. - parse_claims (bool): Whether to parse entity claims. - """ - - # Read the JSON wiki data and parse out the entities. Takes about 7-10h to parse 55M lines. - # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ - - site_filter = "{}wiki".format(lang) - - # filter: currently defined as OR: one hit suffices to be removed from further processing - exclude_list = WD_META_ITEMS - - # punctuation - exclude_list.extend(["Q1383557", "Q10617810"]) - - # letters etc - exclude_list.extend( - ["Q188725", "Q19776628", "Q3841820", "Q17907810", "Q9788", "Q9398093"] - ) - - neg_prop_filter = { - "P31": exclude_list, # instance of - "P279": exclude_list, # subclass - } - - entity_ids_in_db: Set[str] = { - rec["id"] for rec in db_conn.cursor().execute("SELECT id FROM entities") - } - title_to_id: Dict[str, str] = {} - id_to_attrs: Dict[str, Dict[str, Any]] = {} - - with bz2.open(wikidata_file, mode="rb") as file: - pbar_params = {"total": limit} if limit else {} - - with tqdm.tqdm( - desc="Parsing entity data", leave=True, miniters=1000, **pbar_params - ) as pbar: - for cnt, line in enumerate(file): - pbar.update(1) - if limit and cnt >= limit: - break - - clean_line = line.strip() - if clean_line.endswith(b","): - clean_line = clean_line[:-1] - - if len(clean_line) > 1: - obj = json.loads(clean_line) - if obj.get("id") in entity_ids_in_db: - continue - entry_type = obj["type"] - - if entry_type == "item": - keep = True - - claims = obj["claims"] - filtered_claims: List[Dict[str, str]] = [] - if parse_claims: - for prop, value_set in neg_prop_filter.items(): - claim_property = claims.get(prop, None) - if claim_property: - filtered_claims.append(claim_property) - for cp in claim_property: - cp_id = ( - cp["mainsnak"] - .get("datavalue", {}) - .get("value", {}) - .get("id") - ) - cp_rank = cp["rank"] - if ( - cp_rank != "deprecated" - and cp_id in value_set - ): - keep = False - - if keep: - unique_id = obj["id"] - if unique_id not in id_to_attrs: - id_to_attrs[unique_id] = {} - if parse_claims: - id_to_attrs[unique_id]["claims"] = 
filtered_claims - - # parsing all properties that refer to other entities - if parse_properties: - id_to_attrs[unique_id]["properties"] = [] - for prop, claim_property in claims.items(): - cp_dicts = [ - cp["mainsnak"]["datavalue"].get("value") - for cp in claim_property - if cp["mainsnak"].get("datavalue") - ] - cp_values = [ - cp_dict.get("id") - for cp_dict in cp_dicts - if isinstance(cp_dict, dict) - if cp_dict.get("id") is not None - ] - if cp_values: - id_to_attrs[unique_id]["properties"].append( - (prop, cp_values) - ) - - found_link = False - if parse_sitelinks: - site_value = obj["sitelinks"].get(site_filter, None) - if site_value: - site = site_value["title"] - title_to_id[site] = unique_id - found_link = True - id_to_attrs[unique_id]["sitelinks"] = site_value - - if parse_labels: - labels = obj["labels"] - if labels: - lang_label = labels.get(lang, None) - if lang_label: - id_to_attrs[unique_id]["labels"] = lang_label - - if found_link and parse_descr: - descriptions = obj["descriptions"] - if descriptions: - lang_descr = descriptions.get(lang, None) - if lang_descr: - id_to_attrs[unique_id][ - "description" - ] = lang_descr["value"] - - if parse_aliases: - id_to_attrs[unique_id]["aliases"] = [] - aliases = obj["aliases"] - if aliases: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - id_to_attrs[unique_id]["aliases"].append( - item["value"] - ) - - pbar.update(1) - - # Save batch. - if pbar.n % batch_size == 0: - _write_to_db(db_conn, title_to_id, id_to_attrs) - title_to_id = {} - id_to_attrs = {} - - if pbar.n % batch_size != 0: - _write_to_db(db_conn, title_to_id, id_to_attrs) - - -def _write_to_db( - db_conn: sqlite3.Connection, - title_to_id: Dict[str, str], - id_to_attrs: Dict[str, Dict[str, Any]], -) -> None: - """Persists entity information to database. - db_conn (Connection): Database connection. - title_to_id (Dict[str, str]): Titles to QIDs. - id_to_attrs (Dict[str, Dict[str, Any]]): For QID a dictionary with property name to property value(s). - """ - - entities: List[Tuple[Optional[str], ...]] = [] - entities_texts: List[Tuple[Optional[str], ...]] = [] - props_in_ents: Set[Tuple[str, str, str]] = set() - aliases_for_entities: List[Tuple[str, str, int]] = [] - - for title, qid in title_to_id.items(): - entities.append((qid, json.dumps(id_to_attrs[qid]["claims"]))) - entities_texts.append( - ( - qid, - title, - id_to_attrs[qid].get("description", None), - id_to_attrs[qid].get("labels", {}).get("value", None), - ) - ) - for alias in id_to_attrs[qid]["aliases"]: - aliases_for_entities.append((alias, qid, 1)) - - for prop in id_to_attrs[qid]["properties"]: - for second_qid in prop[1]: - props_in_ents.add((prop[0], qid, second_qid)) - - cur = db_conn.cursor() - cur.executemany( - "INSERT INTO entities (id, claims) VALUES (?, ?)", - entities, - ) - cur.executemany( - "INSERT INTO entities_texts (entity_id, name, description, label) VALUES (?, ?, ?, ?)", - entities_texts, - ) - cur.executemany( - "INSERT INTO properties_in_entities (property_id, from_entity_id, to_entity_id) VALUES (?, ?, ?)", - props_in_ents, - ) - cur.executemany( - """ - INSERT INTO aliases_for_entities (alias, entity_id, count) VALUES (?, ?, ?) 
- ON CONFLICT (alias, entity_id) DO UPDATE SET - count=count + excluded.count - """, - aliases_for_entities, - ) - db_conn.commit() - - -def extract_demo_dump(in_dump_path: Path, out_dump_path: Path, filter_terms: Set[str]) -> Tuple[Set[str], Set[str]]: - """Writes information on those entities having at least one of the filter_terms in their description to a new dump - at location filtered_dump_path. - in_dump_path (Path): Path to complete Wikidata dump. - out_dump_path (Path): Path to filtered Wikidata dump. - filter_terms (Set[str]): Terms having to appear in entity descriptions in order to be included in output dump. - RETURNS (Tuple[Set[str], Set[str]]): For retained entities: (1) set of QIDs, (2) set of labels (should match article - titles). - """ - - entity_ids: Set[str] = set() - entity_labels: Set[str] = set() - filter_terms = {ft.lower() for ft in filter_terms} - - with bz2.open(in_dump_path, mode="rb") as in_file: - with bz2.open(out_dump_path, mode="wb") as out_file: - write_count = 0 - with tqdm.tqdm( - desc="Filtering entity data", leave=True, miniters=100 - ) as pbar: - for cnt, line in enumerate(in_file): - keep = cnt == 0 - - if not keep: - clean_line = line.strip() - if clean_line.endswith(b","): - clean_line = clean_line[:-1] - if len(clean_line) > 1: - keep = any([ft in clean_line.decode("utf-8").lower() for ft in filter_terms]) - if keep: - obj = json.loads(clean_line) - label = obj["labels"].get("en", {}).get("value", "") - entity_ids.add(obj["id"]) - entity_labels.add(label) - - if keep: - out_file.write(line) - write_count += 1 - - pbar.update(1) - - return entity_ids, entity_labels diff --git a/benchmarks/nel/scripts/wiki/wikipedia.py b/benchmarks/nel/scripts/wiki/wikipedia.py deleted file mode 100644 index a32f6a5cb..000000000 --- a/benchmarks/nel/scripts/wiki/wikipedia.py +++ /dev/null @@ -1,656 +0,0 @@ -""" Functionalities for processing Wikipedia dump. -Modified from https://github.com/explosion/projects/blob/master/nel-wikipedia/wikipedia_processor.py. -""" - -import re -import bz2 -import pysqlite3 as sqlite3 - -from pathlib import Path -from typing import Union, Optional, Tuple, List, Dict, Set, Any - -import tqdm - -from wiki.namespaces import ( - WP_META_NAMESPACE, - WP_FILE_NAMESPACE, - WP_CATEGORY_NAMESPACE, -) - -""" -Process a Wikipedia dump to calculate entity_title frequencies and prior probabilities in combination with certain mentions. -Write these results to file for downstream KB and training data generation. - -Process Wikipedia interlinks to generate a training dataset for the EL algorithm. -""" - -map_alias_to_link = dict() - -title_regex = re.compile(r"(?<=).*(?=)") -id_regex = re.compile(r"(?<=)\d*(?=)") -text_tag_regex = re.compile(r"(?<=)") -text_regex = re.compile(r"(?<=).*(?= None: - """ - Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities. - The full file takes about 2-3h to parse 1100M lines. Writes prior information to DB. - It works relatively fast because it runs line by line, irrelevant of which article the intrawiki is from. - wikidata_input_path (Union[str, Path]): Path to Wikipedia dump. - batch_size (int): DB batch size. - db_conn (sqlite3.Connection): Database connection. - n_article_limit (Optional[int]): Number of articles/entities to process. 
- """ - - read_id = False - current_article_id = None - entity_title_to_id = { - row["name"]: row["id"] - for row in db_conn.execute("SELECT name, id FROM entities") - } - - def write_to_db(_aliases_for_entities) -> None: - """Writes record triples to DB. - __aliases_for_entities (): alias-entity-frequency triples. - """ - db_conn.cursor().executemany( - """ - INSERT INTO aliases_for_entities (alias, entity_id, count) VALUES (?, ?, ?) - ON CONFLICT (alias, entity_id) DO UPDATE SET - count=count + excluded.count - """, - _aliases_for_entities, - ) - db_conn.commit() - - with bz2.open(wikidata_input_path, mode="rb") as file: - pbar_params = {"total": limit} if limit else {} - with tqdm.tqdm( - desc="Parsing alias-entity prior probabilities", **pbar_params - ) as pbar: - line = file.readline() - while line and (not limit or pbar.n < limit): - clean_line = line.strip().decode("utf-8") - - # we attempt at reading the article's ID (but not the revision or contributor ID) - if "" in clean_line or "" in clean_line: - read_id = False - if "" in clean_line: - read_id = True - - if read_id: - ids = id_regex.search(clean_line) - if ids: - current_article_id = ids[0] - - # only processing prior probabilities from true training (non-dev) articles - if not is_dev(current_article_id): - aliases, entities, normalizations = _get_wp_links(clean_line) - for alias, entity_title, norm in zip( - aliases, entities, normalizations - ): - _store_alias( - alias, - entity_title, - normalize_alias=norm, - normalize_entity=True, - ) - - line = file.readline() - pbar.update(1) - - # write all aliases and their entities and count occurrences to file - with tqdm.tqdm( - desc="Persisting alias-entity prior probabilities", total=len(map_alias_to_link) - ) as pbar: - aliases_for_entities: List[Tuple[str, str, int]] = [] - for alias, alias_dict in map_alias_to_link.items(): - for entity_title, count in alias_dict.items(): - if entity_title in entity_title_to_id: - aliases_for_entities.append( - (alias, entity_title_to_id[entity_title], count) - ) - if pbar.n % batch_size == 0: - write_to_db(aliases_for_entities) - aliases_for_entities = [] - - pbar.update(1) - - if pbar.n % batch_size != 0: - write_to_db(aliases_for_entities) - - -def _store_alias( - alias: str, - entity_title: str, - normalize_alias: bool = False, - normalize_entity: bool = True, -) -> None: - """Stores (normalized) alias for (normalized) entity_title ID in mapping dictionaries. - alias (str): Alias text. - entity_title (str): Entity title. - normalize_alias (bool): Whether to normalize the alias text, i.e. remove anchors. - normalize_entity (bool): Whether to normalize the entity title. - """ - alias = alias.strip() - entity_title = entity_title.strip() - - # remove everything after # as this is not part of the title but refers to a specific paragraph - if normalize_entity: - # wikipedia titles are always capitalized - entity_title = _capitalize_first(entity_title.split("#")[0]) - if normalize_alias: - alias = alias.split("#")[0] - - if alias and entity_title: - alias_dict = map_alias_to_link.get(alias, dict()) - entity_count = alias_dict.get(entity_title, 0) - alias_dict[entity_title] = entity_count + 1 - map_alias_to_link[alias] = alias_dict - - -def _get_wp_links(text: str) -> Tuple[List[str], List[str], List[bool]]: - """Retrieve interwiki links from text. - text (str): Text to parse. - RETURNS (Tuple[List[str], List[str], List[bool]]): List of aliases, entity titles, and whether normalization they - were normalized. 
- """ - aliases: List[str] = [] - entities: List[str] = [] - normalizations: List[bool] = [] - - matches = link_regex.findall(text) - for match in matches: - match = match[2:][:-2].replace("_", " ").strip() - - if ns_regex.match(match): - pass # ignore the entity_title if it points to a "meta" page - - # this is a simple [[link]], with the alias the same as the mention - elif "|" not in match: - aliases.append(match) - entities.append(match) - normalizations.append(True) - - # in wiki format, the link is written as [[entity_title|alias]] - else: - splits = match.split("|") - entity = splits[0].strip() - alias = splits[1].strip() - # specific wiki format [[alias (specification)|]] - if len(alias) == 0 and "(" in entity: - alias = entity.split("(")[0] - aliases.append(alias) - entities.append(entity) - normalizations.append(False) - else: - aliases.append(alias) - entities.append(entity) - normalizations.append(False) - - return aliases, entities, normalizations - - -def _capitalize_first(text: str) -> Optional[str]: - """Capitalize first character. - text (str): String in which to capitalize first character. - RETURN (Optional[str]): Text with first character capitalized. - """ - if not text: - return None - result = text[0].capitalize() - if len(result) > 0: - result += text[1:] - return result - - -def read_texts( - wikipedia_input_path: Union[str, Path], - db_conn: sqlite3.Connection, - batch_size: int = 5000, - limit: Optional[int] = None, - n_char_limit: int = 1000, -) -> None: - """ - Read the XML Wikipedia data to parse out clean article texts. Texts are stored in file. - wikipedia_input_path (Union[str, Path]): Path to Wikipedia dump. - db_conn (sqlite3.Connection): DB connection. - limit (Optional[int]): Max. number of articles to process. If None, all are processed. - n_char_limit (Optional[int]): Max. number of characters to process per article. - """ - read_ids: Set[str] = set() - entity_title_to_id = { - row["name"]: row["id"] - for row in db_conn.execute("SELECT name, id FROM entities") - } - # records: List[Tuple[str, str, str, str]] = [] - article_records: List[Tuple[str, str]] = [] - article_texts_records: List[Tuple[str, str, str]] = [] - # Fetch IDs of entities whose articles are already in the DB. - article_ids_in_db: Set[str] = { - rec["id"] for rec in db_conn.cursor().execute("SELECT id FROM articles") - } - - def write_to_db(_article_records: List[Tuple[str, str]], _article_text_records: List[Tuple[str, str, str]]) -> None: - """Writes records to list. - _article_records (List[Tuple[str, str]]): `articles`entries with entity ID, ID. - _article_texts_records (List[Tuple[str, str, str]]): `articles_texts` entries with entity ID, title, content. - """ - db_conn.cursor().executemany( - "INSERT OR IGNORE INTO articles (entity_id, id) VALUES (?, ?)", - _article_records, - ) - db_conn.cursor().executemany( - "INSERT OR IGNORE INTO articles_texts (entity_id, title, content) VALUES (?, ?, ?)", - _article_text_records - ) - db_conn.commit() - - with bz2.open(wikipedia_input_path, mode="rb") as file: - pbar_params = {"total": limit} if limit else {} - with tqdm.tqdm(desc="Parsing article texts", miniters=1000, **pbar_params) as pbar: - n_articles = 0 - n_viable_articles = 0 - article_text = "" - article_title: Optional[str] = None - article_id: Optional[str] = None - reading_text = False - reading_revision = False - # Terms in article indicating it should be skipped (for redirects and disambiguation pages). 
-            # Note: checks for redirection/disambiguation articles are not language-agnostic. Porting this to the
-            # generalized extraction needs to consider that.
-            skip_terms = ("#redirect", "#redirection", "{{disambiguation}}")
-            skip_article = False
-
-            for line in file:
-                if limit and pbar.n >= limit:
-                    break
-
-                clean_line = line.strip().decode("utf-8")
-
-                # Check if article is to be skipped.
-                cl_lower = clean_line.lower()
-                for skip_term in skip_terms:
-                    if skip_term in cl_lower:
-                        skip_article = True
-                        break
-                # Skip to next line if article is to be skipped.
-                if skip_article and clean_line != "</page>":
-                    continue
-
-                if clean_line == "<revision>":
-                    reading_revision = True
-                elif clean_line == "</revision>":
-                    reading_revision = False
-
-                # Start reading new page
-                if clean_line == "<page>":
-                    n_articles += 1
-                    article_text = ""
-                    article_title = None
-                    article_id = None
-
-                # finished reading this page
-                elif clean_line == "</page>":
-                    if article_id and article_id not in article_ids_in_db:
-                        clean_text, entities = _process_wp_text(
-                            article_title, article_text, entity_title_to_id
-                        )
-                        if clean_text is not None:
-                            n_viable_articles += 1
-                            if article_title in entity_title_to_id:
-                                text_to_append = clean_text[:n_char_limit]
-                                for (to_replace, replacement) in (
-                                    ("(;", " "),
-                                    ("(,", " "),
-                                    (" ; ", " "),
-                                    (" , ", ""),
-                                    ("()", ""),
-                                ):
-                                    text_to_append = text_to_append.replace(
-                                        to_replace, replacement
-                                    )
-
-                                article_records.append((entity_title_to_id[article_title], article_id))
-                                article_texts_records.append(
-                                    (
-                                        entity_title_to_id[article_title],
-                                        article_title,
-                                        " ".join(text_to_append.split(" ")[:-1]),
-                                    )
-                                )
-                    pbar.update(1)
-
-                    if pbar.n % batch_size == 0:
-                        write_to_db(article_records, article_texts_records)
-                        article_records = []
-                        article_texts_records = []
-
-                    article_text = ""
-                    article_title = None
-                    article_id = None
-                    reading_text = False
-                    reading_revision = False
-                    skip_article = False
-
-                # start reading text within a page
- if " None:
-    """Writes information on those entities having at least one of the filter_terms in their description to a new dump
-    at location filtered_dump_path.
-    in_dump_path (Path): Path to complete Wikidata dump.
-    out_dump_path (Path): Path to filtered Wikidata dump.
-    entity_titles (Set[str]): Entity titles to include.
-    """
-
-    with bz2.open(in_dump_path, mode="rb") as in_file:
-        with bz2.open(out_dump_path, mode="wb") as out_file:
-            with tqdm.tqdm(desc="Filtering article texts", miniters=1, total=len(entity_titles)) as pbar:
-                reading_revision = False
-                line_cache: List[bytes] = []
-
-                for line in in_file:
-                    clean_line = line.strip().decode("utf-8")
-
-                    if clean_line == "<revision>":
-                        reading_revision = True
-                    elif clean_line == "</revision>":
-                        reading_revision = False
-
-                    # Start reading new page
-                    if clean_line == "<page>":
-                        line_cache = [line]
-                        article_title = None
-
-                    else:
-                        line_cache.append(line)
-                        # finished reading this page
-                        if clean_line == "</page>":
-                            line_cache.append(line)
-                            if article_title and article_title in entity_titles:
-                                out_file.writelines(line_cache)
-                                line_cache = []
-                                pbar.update(1)
-
-                            article_title = None
-                            reading_revision = False
-
-                    # read the title of this article (outside the revision portion of the document)
-                    if not reading_revision:
-                        titles = title_regex.search(clean_line)
-                        if titles:
-                            article_title = titles[0].strip()
-
-
-def _process_wp_text(
-    article_title: str, article_text: str, entity_title_to_id: Dict[str, str]
-) -> Tuple[Optional[str], Optional[List[Tuple[str, Any, int, int]]]]:
-    """Process article text.
-    article_title (str): Article title.
-    article_text (str): Article text.
-    entity_title_to_id (Dict[str, str]): Map for entity/article titles to their IDs.
-    RETURNS (Tuple[Optional[str], Optional[List[Tuple[str, Any, int, int]]]]): Cleaned text and list of entities in
-        article text.
-    """
-    # ignore meta Wikipedia pages
-    if ns_regex.match(article_title):
-        return None, None
-
-    # remove the text tags
-    text_search = text_tag_regex.sub("", article_text)
-    text_search = text_regex.search(text_search)
-    if text_search is None:
-        return None, None
-    text = text_search.group(0)
-
-    # stop processing if this is a redirect page
-    if text.startswith("#REDIRECT"):
-        return None, None
-
-    # get the raw text without markup etc, keeping only interwiki links
-    return _remove_links(_get_clean_wp_text(text), entity_title_to_id)
-
-
-def _get_clean_wp_text(article_text: str) -> str:
-    """Cleans article text.
-    article_text (str): Text to clean.
-    RETURNS (str): Cleaned text.
-    """
-    clean_text = article_text.strip()
-
-    # remove bolding & italic markup
-    clean_text = clean_text.replace("'''", "")
-    clean_text = clean_text.replace("''", "")
-
-    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
-    try_again = True
-    previous_length = len(clean_text)
-    while try_again:
-        clean_text = info_regex.sub(
-            "", clean_text
-        )  # non-greedy match excluding a nested {
-        if len(clean_text) < previous_length:
-            try_again = True
-        else:
-            try_again = False
-        previous_length = len(clean_text)
-
-    # remove HTML comments
-    clean_text = html_regex.sub("", clean_text)
-
-    # remove Category and File statements
-    clean_text = category_regex.sub("", clean_text)
-    clean_text = file_regex.sub("", clean_text)
-
-    # remove multiple =
-    while "==" in clean_text:
-        clean_text = clean_text.replace("==", "=")
-
-    clean_text = clean_text.replace(". =", ".")
-    clean_text = clean_text.replace(" = ", ". ")
-    clean_text = clean_text.replace("= ", ".")
-    clean_text = clean_text.replace(" =", "")
-
-    # remove refs (non-greedy match)
-    clean_text = ref_regex.sub("", clean_text)
-    clean_text = ref_2_regex.sub("", clean_text)
-
-    # remove additional wikiformatting
-    clean_text = re.sub(r"&lt;blockquote&gt;", "", clean_text)
-    clean_text = re.sub(r"&lt;/blockquote&gt;", "", clean_text)
-
-    # change special characters back to normal ones
-    clean_text = clean_text.replace(r"&lt;", "<")
-    clean_text = clean_text.replace(r"&gt;", ">")
-    clean_text = clean_text.replace(r"&quot;", '"')
-    clean_text = clean_text.replace(r"&amp;nbsp;", " ")
-    clean_text = clean_text.replace(r"&amp;", "&")
-
-    # remove multiple spaces
-    while "  " in clean_text:
-        clean_text = clean_text.replace("  ", " ")
-
-    return clean_text.strip()
-
-
-def _remove_links(
-    clean_text: str, entity_title_to_id: Dict[str, str]
-) -> Tuple[Optional[str], Optional[List[Tuple[str, Any, int, int]]]]:
-    """Remove links from clean text.
-    clean_text (str): Cleaned article text.
-    entity_title_to_id (Dict[str, str]): Map for entity/article titles to their IDs.
-    RETURNS (Tuple[Optional[str], Optional[List[Tuple[str, Any, int, int]]]]): Cleaned text without links, information
-        on entities in text.
- """ - # read the text char by char to get the right offsets for the interwiki links - entities = [] - final_text = "" - open_read = 0 - reading_text = True - reading_entity = False - reading_mention = False - reading_special_case = False - entity_buffer = "" - mention_buffer = "" - for index, letter in enumerate(clean_text): - if letter == "[": - open_read += 1 - elif letter == "]": - open_read -= 1 - elif letter == "|": - if reading_text: - final_text += letter - # switch from reading entity_title to mention in the [[entity_title|mention]] pattern - elif reading_entity: - reading_text = False - reading_entity = False - reading_mention = True - else: - reading_special_case = True - else: - if reading_entity: - entity_buffer += letter - elif reading_mention: - mention_buffer += letter - elif reading_text: - final_text += letter - else: - raise ValueError("Not sure at point", clean_text[index - 2 : index + 2]) - - if open_read > 2: - reading_special_case = True - - if open_read == 2 and reading_text: - reading_text = False - reading_entity = True - reading_mention = False - - # we just finished reading an entity_title - if open_read == 0 and not reading_text: - if "#" in entity_buffer or entity_buffer.startswith(":"): - reading_special_case = True - # Ignore cases with nested structures like File: handles etc - if not reading_special_case: - if not mention_buffer: - mention_buffer = entity_buffer - start = len(final_text) - end = start + len(mention_buffer) - qid = entity_title_to_id.get(entity_buffer, None) - if qid: - entities.append((mention_buffer, qid, start, end)) - final_text += mention_buffer - - entity_buffer = "" - mention_buffer = "" - - reading_text = True - reading_entity = False - reading_mention = False - reading_special_case = False - - return final_text, entities - - -def is_dev(article_id: str) -> bool: - """Checks whether article is dev article. - article_id (str): Article ID. - RETURNS (bool): Whether article is dev article. - """ - if not article_id: - return False - return article_id.endswith("3") - - -def is_valid_article(doc_text: str) -> bool: - """Checks whether article is valid. - doc_text (str): Article text to check. - RETURNS (bool): Whether article text is valid. - """ - # custom length cut-off - return 10 < len(doc_text) < 30000 - - -def is_valid_sentence(sent_text: str) -> bool: - """Checks whether sentence is valid. - sent_text (str): Sentence to check. - RETURNS (bool): Whether sentence is valid. - """ - if not 10 < len(sent_text) < 3000: - # custom length cut-off - return False - - if sent_text.strip().startswith("*") or sent_text.strip().startswith("#"): - # remove 'enumeration' sentences (occurs often on Wikipedia) - return False - - return True diff --git a/pipelines/tagger_parser_ud/configs/default.cfg b/pipelines/tagger_parser_ud/configs/default.cfg index 964acd530..43528eafd 100644 --- a/pipelines/tagger_parser_ud/configs/default.cfg +++ b/pipelines/tagger_parser_ud/configs/default.cfg @@ -159,8 +159,8 @@ compound = 1.001 t = 0.0 [training.logger] -@loggers = "spacy.ConsoleLogger.v1" -progress_bar = false +@loggers = "spacy.ClearMLLogger.v1" +project_name = "test_project" [training.optimizer] @optimizers = "Adam.v1" From 432ce49a785ed3777fd4e80cbe181a997d3e0e21 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 24 Oct 2022 15:55:24 +0200 Subject: [PATCH 04/38] Update project.yml and test. 
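This patch swaps the single parse_wiki_dumps step for the wikid-based steps and mirrors the new order in the test. As a rough sketch of the resulting sequence when driven programmatically — step names and project layout taken from the diff below, directory path illustrative:

    from pathlib import Path
    from spacy.cli.project.run import project_run

    root = Path("benchmarks/nel")  # illustrative path
    for step in ("download_mewsli9", "preprocess", "download_model",
                 "wiki_clone_wikid", "wiki_download_assets", "wiki_parse",
                 "create_kb", "compile_corpora"):
        project_run(root, step, capture=True)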
--- benchmarks/nel/project.yml | 3 +++ benchmarks/nel/requirements.txt | 3 +-- benchmarks/nel/test_nel_benchmark.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 24551fd16..d17368b72 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -19,6 +19,9 @@ workflows: - download_mewsli9 - preprocess - download_model + - wiki_clone_wikid + - wiki_download_assets + - wiki_parse - parse_wiki_dumps - create_kb - compile_corpora diff --git a/benchmarks/nel/requirements.txt b/benchmarks/nel/requirements.txt index cdc9e816c..2fe786dcc 100644 --- a/benchmarks/nel/requirements.txt +++ b/benchmarks/nel/requirements.txt @@ -4,5 +4,4 @@ prettytable scikit-learn fuzzyset2 spacyfishing -virtualenv -pysqlite3-binary \ No newline at end of file +virtualenv \ No newline at end of file diff --git a/benchmarks/nel/test_nel_benchmark.py b/benchmarks/nel/test_nel_benchmark.py index 2265f22f2..cd4ab9b6b 100644 --- a/benchmarks/nel/test_nel_benchmark.py +++ b/benchmarks/nel/test_nel_benchmark.py @@ -13,7 +13,9 @@ def test_nel_benchmark(): project_run(root, "download_mewsli9", capture=True) project_run(root, "preprocess", capture=True) project_run(root, "download_model", capture=True) - project_run(root, "parse_wiki_dumps", capture=True) + project_run(root, "wiki_clone_wikid", capture=True) + project_run(root, "wiki_download_assets", capture=True) + project_run(root, "wiki_parse", capture=True) project_run(root, "create_kb", capture=True) project_run(root, "compile_corpora", capture=True) project_run(root, "train", capture=True, overrides={"vars.training_max_steps": 1}) From 0d9cdd4c737dfd02f767c04450667f3163b2ffbb Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 24 Oct 2022 16:01:23 +0200 Subject: [PATCH 05/38] Fix project.yml. --- benchmarks/nel/project.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index d17368b72..5686ccc8a 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -22,7 +22,6 @@ workflows: - wiki_clone_wikid - wiki_download_assets - wiki_parse - - parse_wiki_dumps - create_kb - compile_corpora - train @@ -137,7 +136,7 @@ commands: - "evaluation/${vars.dataset}" - name: delete_wiki_db - help: "Deletes SQLite database generated in step parse_wiki_dumps with data parsed from Wikidata and Wikipedia dump." + help: "Deletes SQLite database generated in step wiki_parse with data parsed from Wikidata and Wikipedia dump." script: - "rm -f assets/wiki/wiki.sqlite3" deps: From de3a465bfceed3e1972a0069c59d6beddc0e3e52 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 24 Oct 2022 17:04:44 +0200 Subject: [PATCH 06/38] Update project.yml to install wikid. --- benchmarks/nel/project.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 5686ccc8a..00053d990 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -59,6 +59,7 @@ commands: # To be updated after merge. - git clone https://github.com/rmitsch/wikid.git && cd wikid && git checkout feature/wiki-parsing && cd .. - pip install -r wikid/requirements.txt + - pip install -e wikid - name: wiki_download_assets help: "Download Wikipedia dumps. This can take a long time if you're not using the filtered dumps!" 
From 1ae4ea043a337a62d43e22fc4dc94d323437bf59 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 24 Oct 2022 17:06:11 +0200 Subject: [PATCH 07/38] Add setup for wikid. --- benchmarks/nel/project.yml | 2 +- benchmarks/nel/scripts/datasets/dataset.py | 2 +- benchmarks/nel/scripts/datasets/utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 00053d990..cb1a62b6e 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -57,7 +57,7 @@ commands: help: Clone `wikid` to prepare Wiki database and `KnowledgeBase`. script: # To be updated after merge. - - git clone https://github.com/rmitsch/wikid.git && cd wikid && git checkout feature/wiki-parsing && cd .. + - git clone https://github.com/rmitsch/wikid.git --branch feature/wiki-parsing - pip install -r wikid/requirements.txt - pip install -e wikid diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/scripts/datasets/dataset.py index 89c9ff61a..698a806ee 100644 --- a/benchmarks/nel/scripts/datasets/dataset.py +++ b/benchmarks/nel/scripts/datasets/dataset.py @@ -24,7 +24,7 @@ from spacy.pipeline import EntityLinker from schemas import Annotation, Entity -from wiki import wiki_dump_api +from wikid import wiki_dump_api from . import evaluation from utils import get_logger diff --git a/benchmarks/nel/scripts/datasets/utils.py b/benchmarks/nel/scripts/datasets/utils.py index 743a6bf3e..5605d76cf 100644 --- a/benchmarks/nel/scripts/datasets/utils.py +++ b/benchmarks/nel/scripts/datasets/utils.py @@ -4,7 +4,7 @@ import tqdm from spacy.tokens import Token, Span, Doc from schemas import Entity, Annotation -from wiki import wiki_dump_api +from wikid import wiki_dump_api def _does_token_overlap_with_annotation( From f5d6d894aeb425df8a1242635e28b849cf9f767d Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 24 Oct 2022 17:07:33 +0200 Subject: [PATCH 08/38] Update test and step sequence. --- benchmarks/nel/project.yml | 20 ++++++++++---------- benchmarks/nel/test_nel_benchmark.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index cb1a62b6e..56f1b2b79 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -17,9 +17,9 @@ directories: ["assets", "training", "configs", "scripts", "corpora", "temp", "ev workflows: all: - download_mewsli9 - - preprocess - download_model - wiki_clone_wikid + - preprocess - wiki_download_assets - wiki_parse - create_kb @@ -39,15 +39,6 @@ commands: outputs: - assets/mewsli_9/ - - name: preprocess - help: Preprocess test datasets. - script: - - "python ./scripts/clean_data.py ${vars.dataset}" - deps: - - "assets/${vars.dataset}/raw" - outputs: - - "assets/${vars.dataset}/clean" - - name: download_model help: "Download a model with pretrained vectors and NER component." script: @@ -61,6 +52,15 @@ commands: - pip install -r wikid/requirements.txt - pip install -e wikid + - name: preprocess + help: Preprocess test datasets. + script: + - "python ./scripts/clean_data.py ${vars.dataset}" + deps: + - "assets/${vars.dataset}/raw" + outputs: + - "assets/${vars.dataset}/clean" + - name: wiki_download_assets help: "Download Wikipedia dumps. This can take a long time if you're not using the filtered dumps!" 
script: diff --git a/benchmarks/nel/test_nel_benchmark.py b/benchmarks/nel/test_nel_benchmark.py index cd4ab9b6b..e64d9e0c9 100644 --- a/benchmarks/nel/test_nel_benchmark.py +++ b/benchmarks/nel/test_nel_benchmark.py @@ -11,9 +11,9 @@ def test_nel_benchmark(): root = Path(__file__).parent project_assets(root) project_run(root, "download_mewsli9", capture=True) - project_run(root, "preprocess", capture=True) project_run(root, "download_model", capture=True) project_run(root, "wiki_clone_wikid", capture=True) + project_run(root, "preprocess", capture=True) project_run(root, "wiki_download_assets", capture=True) project_run(root, "wiki_parse", capture=True) project_run(root, "create_kb", capture=True) From a557e78b24750093624de3871c938966815d37ec Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 26 Oct 2022 15:11:48 +0200 Subject: [PATCH 09/38] Various fixes w.r.t. wikid integration. --- benchmarks/nel/configs/nel.cfg | 13 +- benchmarks/nel/project.yml | 75 +++++---- .../nel/scripts/candidate_generation/base.py | 9 +- benchmarks/nel/scripts/clean_data.py | 5 +- benchmarks/nel/scripts/compare_evaluations.py | 5 +- benchmarks/nel/scripts/compile_corpora.py | 7 +- benchmarks/nel/scripts/create_kb.py | 19 --- benchmarks/nel/scripts/datasets/dataset.py | 159 +++++------------- benchmarks/nel/scripts/datasets/mewsli_9.py | 12 +- benchmarks/nel/scripts/datasets/utils.py | 25 +-- benchmarks/nel/scripts/evaluate.py | 8 +- benchmarks/nel/scripts/parse_corpus.py | 16 ++ benchmarks/nel/scripts/schemas.py | 28 --- benchmarks/nel/scripts/train.sh | 18 +- benchmarks/nel/scripts/utils.py | 8 - benchmarks/nel/temp/.gitignore | 3 - benchmarks/nel/test_nel_benchmark.py | 9 +- .../tagger_parser_ud/configs/default.cfg | 2 +- 18 files changed, 164 insertions(+), 257 deletions(-) delete mode 100644 benchmarks/nel/scripts/create_kb.py create mode 100644 benchmarks/nel/scripts/parse_corpus.py delete mode 100644 benchmarks/nel/scripts/schemas.py delete mode 100644 benchmarks/nel/temp/.gitignore diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index 31fa5d26c..5fb7bd579 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -1,5 +1,6 @@ [paths] dataset_name = null +language = null train = "" dev = "" raw = null @@ -39,13 +40,13 @@ entity_vector_length = 64 incl_context = true incl_prior = true labels_discard = [] -get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} -;[components.entity_linker.get_candidates] -;@misc = "FuzzyStringGetCandidates.v1" -;dataset_name = "${paths.dataset_name}" -;max_n_candidates = 50 -;similarity_cutoff = 0.5 +[components.entity_linker.get_candidates] +@misc = "FuzzyStringGetCandidates.v1" +dataset_name = "${paths.dataset_name}" +language = "${paths.language}" +max_n_candidates = 50 +similarity_cutoff = 0.5 [components.entity_linker.model] @architectures = "spacy.EntityLinker.v1" diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 56f1b2b79..8b3917e9c 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -1,28 +1,29 @@ title: 'NEL Benchmark' description: "Pipeline for benchmarking NEL approaches (incl. candidate generation and entity disambiguation)." vars: - run: "cg-default" + run: "cg-lexical" + language: "en" config: "nel.cfg" vectors_model: "en_core_web_lg" version: "0.0.6" dataset: "mewsli_9" gpu_id: "" download_all_wiki_assets: "" # "--extra" to download full Wiki dumps. - filter: "--filter" # Whether to only use parts of Wiki data and corpus containing filter terms. 
+ filter: "True" # Whether to only use parts of Wiki data and corpus containing filter terms. training_max_steps: 1000 eval_highlight_metric: "F" # one of ("F", "r", "p") -directories: ["assets", "training", "configs", "scripts", "corpora", "temp", "evaluation"] +directories: ["assets", "training", "configs", "scripts", "corpora", "evaluation"] workflows: all: - download_mewsli9 - download_model - - wiki_clone_wikid + - wikid_clone - preprocess - - wiki_download_assets - - wiki_parse - - create_kb + - wikid_download_assets + - wikid_parse + - wikid_create_kb - compile_corpora - train - evaluate @@ -44,62 +45,70 @@ commands: script: - "python -m spacy download ${vars.vectors_model}" - - name: wiki_clone_wikid + - name: wikid_clone help: Clone `wikid` to prepare Wiki database and `KnowledgeBase`. script: # To be updated after merge. - git clone https://github.com/rmitsch/wikid.git --branch feature/wiki-parsing - pip install -r wikid/requirements.txt - - pip install -e wikid - name: preprocess help: Preprocess test datasets. script: - - "python ./scripts/clean_data.py ${vars.dataset}" + - "env PYTHONPATH=. python ./scripts/clean_data.py ${vars.dataset} ${vars.language}" deps: - "assets/${vars.dataset}/raw" outputs: - "assets/${vars.dataset}/clean" - - name: wiki_download_assets + - name: wikid_download_assets help: "Download Wikipedia dumps. This can take a long time if you're not using the filtered dumps!" script: - "spacy project assets wikid ${vars.download_all_wiki_assets}" outputs: - "wikid/assets/" - - name: wiki_parse + - name: wikid_parse help: "Parse Wikipedia dumps. This can take a long time if you're not using the filtered dumps!" script: - - "spacy project run parse wikid --vars.use_filtered_dumps ${vars.filter}" + - "spacy project run parse wikid --vars.language ${vars.language} --vars.filter True" outputs: - - "assets/wiki/wiki.sqlite3" + - "wikid/output/${vars.language}/wiki.sqlite3" - - name: create_kb + - name: wikid_create_kb help: "Create the knowledge base and write it to file." script: - - "python ./scripts/create_kb.py ${vars.dataset} ${vars.vectors_model}" + - "spacy project run create_kb wikid --vars.language ${vars.language} --vars.vectors_model ${vars.vectors_model}" + deps: + - "wikid/output/${vars.language}/wiki.sqlite3" + outputs_no_cache: + - "wikid/output/${vars.language}/kb" + - "wikid/output/${vars.language}/nlp" + + # todo generate annotations - this was done in create_kb in the past. would make sense in compile_corpora? + + - name: parse_corpus + help: "Parse corpus to generate entity and annotation lookups used for corpora compilation." + script: + - "env PYTHONPATH=. python ./scripts/parse_corpus.py ${vars.dataset} ${vars.language} ${vars.vectors_model}" deps: - "assets/${vars.dataset}/clean" - - "assets/wiki/wiki.sqlite3" + - "wikid/output/${vars.language}/wiki.sqlite3" outputs: - "assets/${vars.dataset}/entities.pkl" - "assets/${vars.dataset}/entities_failed_lookup.pkl" - "assets/${vars.dataset}/annotations.pkl" - outputs_no_cache: - - "temp/${vars.dataset}/kb" - - "temp/${vars.dataset}/nlp" - name: compile_corpora help: "Compile corpora, separated in train/dev/test sets." script: - - "python ./scripts/compile_corpora.py ${vars.dataset} ${vars.filter}" + - "env PYTHONPATH=. 
python ./scripts/compile_corpora.py ${vars.dataset} ${vars.language} ${vars.filter}" deps: - "assets/${vars.dataset}/entities.pkl" - "assets/${vars.dataset}/entities_failed_lookups.pkl" - "assets/${vars.dataset}/annotations.pkl" - - "temp/${vars.dataset}/kb" - - "temp/${vars.dataset}/nlp" + - "wikid/output/${vars.language}/kb" + - "wikid/output/${vars.language}/nlp" - "configs/datasets.yml" outputs: - "corpora/${vars.dataset}/train.spacy" @@ -109,22 +118,22 @@ commands: - name: train help: "Train a new Entity Linking component. Pass --vars.gpu_id GPU_ID to train with GPU. Training with some datasets may take a long time!" script: - - "bash scripts/train.sh ${vars.dataset} '${vars.run}' ${vars.config} ${vars.training_max_steps} ${vars.gpu_id}" + - "bash scripts/train.sh ${vars.dataset} '${vars.run}' ${vars.language} ${vars.config} ${vars.training_max_steps} ${vars.gpu_id}" outputs: - "training/${vars.dataset}/${vars.run}" deps: - - "temp/${vars.dataset}/kb" - - "temp/${vars.dataset}/nlp" + - "wikid/output/${vars.language}/kb" + - "wikid/output/${vars.language}/nlp" - "corpora/${vars.dataset}/train.spacy" - "corpora/${vars.dataset}/dev.spacy" - name: evaluate help: "Evaluate on the test set." script: - - "env PYTHONPATH=. python ./scripts/evaluate.py ${vars.dataset} '${vars.run}'" + - "env PYTHONPATH=. python ./scripts/evaluate.py ${vars.dataset} '${vars.run}' ${vars.language}" deps: - "training/${vars.dataset}/${vars.run}/model-best" - - "temp/${vars.dataset}/nlp" + - "wikid/output/${vars.language}/nlp" - "corpora/${vars.dataset}/dev.spacy" outputs: - "evaluation/${vars.dataset}" @@ -132,22 +141,22 @@ commands: - name: compare_evaluations help: "Compare available set of evaluation runs." script: - - "env PYTHONPATH=. python ./scripts/compare_evaluations.py ${vars.dataset} --highlight-criterion ${vars.eval_highlight_metric}" + - "env PYTHONPATH=. python ./scripts/compare_evaluations.py ${vars.dataset} ${vars.language} --highlight-criterion ${vars.eval_highlight_metric}" deps: - "evaluation/${vars.dataset}" - name: delete_wiki_db help: "Deletes SQLite database generated in step wiki_parse with data parsed from Wikidata and Wikipedia dump." script: - - "rm -f assets/wiki/wiki.sqlite3" + - "rm -f wikid/output/${vars.language}/wiki.sqlite3" deps: - - "assets/wiki/wiki.sqlite3" + - "wikid/output/${vars.language}/wiki.sqlite3" - name: clean - help: "Remove intermediate files for specified dataset (excluding Wiki resources and database)" + help: "Remove intermediate files for specified dataset and language (excluding Wiki resources and database)." script: - "rm -rf training/${vars.dataset}" - "rm -rf corpora/${vars.dataset}" - - "rm -rf temp/${vars.dataset}" - "rm -rf assets/${vars.dataset}" - "rm -rf evaluation/${vars.dataset}" + - "rm -rf wikid/output/${vars.language}" diff --git a/benchmarks/nel/scripts/candidate_generation/base.py b/benchmarks/nel/scripts/candidate_generation/base.py index b105b7598..38f25d035 100644 --- a/benchmarks/nel/scripts/candidate_generation/base.py +++ b/benchmarks/nel/scripts/candidate_generation/base.py @@ -19,19 +19,20 @@ class NearestNeighborCandidateSelector(abc.ABC): _entities: Dict[str, Any] = {} def __call__( - self, kb: KnowledgeBase, span: Span, dataset_id: str, max_n_candidates: int, **kwargs + self, kb: KnowledgeBase, span: Span, dataset_id: str, language: str, max_n_candidates: int, **kwargs ) -> Iterable[Candidate]: """Identifies entity candidates. - dataset_id (str): ID of dataset for which to select candidates. 
- max_n_candidates (int): Numbers of nearest neighbours to query. kb (KnowledgeBase): KnowledgeBase containing all possible entity candidates. span (Span): Span to match potential entity candidates with. + dataset_id (str): ID of dataset for which to select candidates. + language (str): Language. + max_n_candidates (int): Numbers of nearest neighbours to query. RETURNS (Iterator[Candidate]): Candidates for specified entity. """ if self._pipeline is None: # Load pipeline and pickled entities. Run name doesn't matter for either of those. - paths = Dataset.assemble_paths(dataset_id, "") + paths = Dataset.assemble_paths(dataset_id, "", language) self._pipeline = spacy.load(paths["nlp_base"]) with open(paths["entities"], "rb") as file: self._entities[dataset_id] = pickle.load(file) diff --git a/benchmarks/nel/scripts/clean_data.py b/benchmarks/nel/scripts/clean_data.py index f30b5147e..a7a7e1b3b 100755 --- a/benchmarks/nel/scripts/clean_data.py +++ b/benchmarks/nel/scripts/clean_data.py @@ -7,12 +7,13 @@ from datasets.dataset import Dataset -def main(dataset_name: str): +def main(dataset_name: str, language: str): """ Removes/fixes error in downloaded datasets. dataset_name (str): Dataset name. + language (str): Language. """ - Dataset.generate_from_id(dataset_name).clean_assets() + Dataset.generate_from_id(dataset_name, language).clean_assets() if __name__ == "__main__": diff --git a/benchmarks/nel/scripts/compare_evaluations.py b/benchmarks/nel/scripts/compare_evaluations.py index 973d6b3a1..86fd87d2a 100644 --- a/benchmarks/nel/scripts/compare_evaluations.py +++ b/benchmarks/nel/scripts/compare_evaluations.py @@ -4,12 +4,13 @@ import typer -def main(dataset_name: str, highlight_criterion: str = "F"): +def main(dataset_name: str, language: str, highlight_criterion: str = "F"): """Compare evaluations across all available runs for this dataset. dataset_name (str): Name of dataset to evaluate on. + language (str): Language. highlight_criterion (str): Criterion to highlight in table. One of ("F", "r", "p"). """ - Dataset.generate_from_id(dataset_name).compare_evaluations(highlight_criterion=highlight_criterion) + Dataset.generate_from_id(dataset_name, language).compare_evaluations(highlight_criterion=highlight_criterion) if __name__ == "__main__": diff --git a/benchmarks/nel/scripts/compile_corpora.py b/benchmarks/nel/scripts/compile_corpora.py index bc477f6e8..dce834ce8 100644 --- a/benchmarks/nel/scripts/compile_corpora.py +++ b/benchmarks/nel/scripts/compile_corpora.py @@ -2,18 +2,19 @@ import typer from datasets.dataset import Dataset -from utils import read_filter_terms +from wikid import read_filter_terms -def main(dataset_name: str, use_filter_terms: bool = typer.Option(False, "--filter")): +def main(dataset_name: str, language: str, use_filter_terms: bool = typer.Argument(False)): """Create corpora in spaCy format. dataset_name (str): Dataset name. + language (str): Language. use_filter_terms (bool): Whether to use the filter terms defined in the dataset config. If True, only documents containing at least one of the specified terms will be included in corpora. If False, all documents are included. """ # Run name isn't relevant for corpora compilation. 
- Dataset.generate_from_id(dataset_name).compile_corpora(read_filter_terms() if use_filter_terms else None) + Dataset.generate_from_id(dataset_name, language).compile_corpora(read_filter_terms() if use_filter_terms else None) if __name__ == "__main__": diff --git a/benchmarks/nel/scripts/create_kb.py b/benchmarks/nel/scripts/create_kb.py deleted file mode 100644 index aa5310853..000000000 --- a/benchmarks/nel/scripts/create_kb.py +++ /dev/null @@ -1,19 +0,0 @@ -""" -Functionality for creating the knowledge base from downloaded assets and by querying Wikipedia's API. -""" - -import typer -from datasets.dataset import Dataset - - -def main(dataset_name: str, vectors_model: str): - """Create the Knowledge Base in spaCy and write it to file. - - dataset_name (str): Dataset name. - vectors_model (str): Name of model with word vectors to use. - """ - Dataset.generate_from_id(dataset_name).create_knowledge_base(vectors_model) - - -if __name__ == "__main__": - typer.run(main) diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/scripts/datasets/dataset.py index 698a806ee..e953a17e8 100644 --- a/benchmarks/nel/scripts/datasets/dataset.py +++ b/benchmarks/nel/scripts/datasets/dataset.py @@ -23,8 +23,7 @@ from spacy.training import Example from spacy.pipeline import EntityLinker -from schemas import Annotation, Entity -from wikid import wiki_dump_api +from wikid import schemas from . import evaluation from utils import get_logger @@ -35,42 +34,46 @@ class Dataset(abc.ABC): """Base class for all datasets used in this benchmark.""" - def __init__(self, run_name: str): + def __init__(self, run_name: str, language: str): """Initializes new Dataset. run_name (str): Run name. + language (str): Language. """ self._run_name = run_name - self._paths = self.assemble_paths(self.name, self._run_name) + self._language = language + self._paths = self.assemble_paths(self.name, self._run_name, self._language) with open(self._paths["root"] / "configs" / "datasets.yml", "r") as stream: self._options = yaml.safe_load(stream)[self.name] - self._entities: Optional[Dict[str, Entity]] = None + self._entities: Optional[Dict[str, schemas.Entity]] = None self._failed_entity_lookups: Optional[Set[str]] = None - self._annotations: Optional[Dict[str, List[Annotation]]] = None + self._annotations: Optional[Dict[str, List[schemas.Annotation]]] = None self._kb: Optional[KnowledgeBase] = None self._nlp_base: Optional[Language] = None self._nlp_best: Optional[Language] = None self._annotated_docs: Optional[List[Doc]] = None @staticmethod - def assemble_paths(dataset_name: str, run_name: str) -> Dict[str, Path]: + def assemble_paths(dataset_name: str, run_name: str, language: str) -> Dict[str, Path]: """Assemble paths w.r.t. dataset ID. dataset_name (str): Dataset name. run_name (str): Run name. + language (str): Language. RETURNS (Dict[str, Path]): Dictionary with internal resource name to path. 
""" root_path = Path(os.path.abspath(__file__)).parent.parent.parent + wikid_path = root_path / "wikid" / "output" assets_path = root_path / "assets" / dataset_name return { "root": root_path, "evaluation": root_path / "configs" / "evaluation.yml", "assets": assets_path, - "nlp_base": root_path / "temp" / dataset_name / "nlp", - "kb": root_path / "temp" / dataset_name / "kb", + "nlp_base": wikid_path / language / "nlp", + "kb": wikid_path / language / "kb", "entities": assets_path / "entities.pkl", "failed_entity_lookups": assets_path / "entities_failed_lookups.pkl", "annotations": assets_path / "annotations.pkl", @@ -83,99 +86,11 @@ def name(self) -> str: """Returns dataset name.""" raise NotImplementedError - def create_knowledge_base(self, model_name: str, **kwargs) -> None: - """Creates and serializes knowledge base. - vectors_model (str): Name of model with word vectors to use. - """ - - self._nlp_base = spacy.load( - model_name, exclude=["tagger", "lemmatizer", "attribute_ruler"] - ) - logger.info("Parsing external corpus") - ( - self._entities, - self._failed_entity_lookups, - self._annotations, - ) = self._parse_external_corpus(**kwargs) - - logger.info( - f"Constructing knowledge base with {len(self._entities)} entries and " - f"{len(self._failed_entity_lookups)} failed lookups." - ) - self._kb = KnowledgeBase( - vocab=self._nlp_base.vocab, - entity_vector_length=self._nlp_base.vocab.vectors_length, - ) - entity_list: List[str] = [] - count_list: List[int] = [] - vector_list: List[numpy.ndarray] = [] # type: ignore - qids = list(self._entities.keys()) - desc_vectors = [ - doc.vector for doc in - tqdm.tqdm( - self._nlp_base.pipe( - texts=[ - self._entities[qid].description if self._entities[qid].description - else ( - self._entities[qid].article_text[:500] if self._entities[qid].article_text - else self._entities[qid].name - ) - for qid in qids - ], - n_process=-1 - ), - total=len(self._entities), - desc="Inferring entity embeddings" - ) - ] - for qid, desc_vector in zip(qids, desc_vectors): - entity_list.append(qid) - count_list.append(self._entities[qid].count) - vector_list.append( - desc_vector - if isinstance(desc_vector, numpy.ndarray) - else desc_vector.get() - ) - self._kb.set_entities( - entity_list=entity_list, vector_list=vector_list, freq_list=count_list - ) - - # Add aliases with normalized priors to KB. - alias_entity_prior_probs = wiki_dump_api.load_alias_entity_prior_probabilities( - set(self._entities.keys()) - ) - for alias, entity_prior_probs in alias_entity_prior_probs.items(): - self._kb.add_alias( - alias=alias, - entities=[epp[0] for epp in entity_prior_probs], - probabilities=[epp[1] for epp in entity_prior_probs], - ) - # Add pseudo aliases for easier lookup with new candidate generators. - for entity_id in entity_list: - self._kb.add_alias( - alias="_" + entity_id + "_", entities=[entity_id], probabilities=[1] - ) - - # Serialize knowledge base & entity information. 
- for to_serialize in ( - (self._paths["entities"], self._entities), - (self._paths["failed_entity_lookups"], self._failed_entity_lookups), - (self._paths["annotations"], self._annotations), - ): - with open(to_serialize[0], "wb") as fp: - pickle.dump(to_serialize[1], fp) - self._kb.to_disk(self._paths["kb"]) - if not os.path.exists(self._paths["nlp_base"]): - os.mkdir(self._paths["nlp_base"]) - self._nlp_base.to_disk(self._paths["nlp_base"]) - logger.info("Successfully constructed knowledge base.") - def compile_corpora(self, filter_terms: Optional[Set[str]] = None) -> None: """Creates train/dev/test corpora for dataset. filter_terms (Optional[Set[str]]): Set of filter terms. Only documents containing at least one of the specified terms will be included in corpora. If None, all documents are included. """ - self._load_resource("entities") self._load_resource("failed_entity_lookups") self._load_resource("annotations") @@ -192,10 +107,34 @@ def _create_annotated_docs(self, filter_terms: Optional[Set[str]] = None) -> Lis """ raise NotImplementedError - def _parse_external_corpus( - self, **kwargs - ) -> Tuple[Dict[str, Entity], Set[str], Dict[str, List[Annotation]]]: - """Parses external corpus. Loads data on entities and mentions. + def parse_corpus(self, **kwargs) -> None: + """Parses corpus. Loads data on entities and mentions. + Populates self._entities, self._failed_entity_lookups, self._annotations. + RETURNS (Tuple[Dict[str, Entity], Set[str], Dict[str, List[Annotation]]]): entities, titles of failed entity + lookups, annotations. + """ + self._load_resource("nlp_base") + logger.info("Parsing external corpus") + ( + self._entities, + self._failed_entity_lookups, + self._annotations, + ) = self._parse_corpus(**kwargs) + + # Serialize entity information. + for to_serialize in ( + (self._paths["entities"], self._entities), + (self._paths["failed_entity_lookups"], self._failed_entity_lookups), + (self._paths["annotations"], self._annotations), + ): + with open(to_serialize[0], "wb") as fp: + pickle.dump(to_serialize[1], fp) + logger.info("Successfully parsed corpus.") + + def _parse_corpus( + self, **kwargs + ) -> Tuple[Dict[str, schemas.Entity], Set[str], Dict[str, List[schemas.Annotation]]]: + """Parses corpus. Loads data on entities and mentions. Populates self._entities, self._failed_entity_lookups, self._annotations. RETURNS (Tuple[Dict[str, Entity], Set[str], Dict[str, List[Annotation]]]): entities, titles of failed entity lookups, annotations. 
@@ -296,22 +235,13 @@ def evaluate(self, run_name: str) -> None: if ent.text.lower().startswith("the ") else ent for ent in doc.ents ] - tmp_docs_path = "/tmp/docs.spacy" - if not os.path.exists(tmp_docs_path): - docs = tqdm.tqdm( - self._nlp_best.pipe(texts=[doc.text for doc in docs], n_process=-1, batch_size=500), - desc="Inferring entities for test set", - total=len(docs) - ) - DocBin(docs=docs).to_disk(tmp_docs_path) - else: - docs = list(DocBin().from_disk(tmp_docs_path).get_docs(self._nlp_best.vocab)) + test_set = [ Example(predicted_doc, doc) for predicted_doc, doc in zip( [ doc for doc in tqdm.tqdm( - docs, # self._nlp_best.pipe(texts=[doc.text for doc in docs], n_process=-1, batch_size=500), + self._nlp_best.pipe(texts=[doc.text for doc in docs], n_process=-1, batch_size=500), desc="Inferring entities for test set", total=len(docs) ) @@ -453,11 +383,12 @@ def compare_evaluations(self, highlight_criterion: str) -> None: @classmethod def generate_from_id( - cls: Type[DatasetType], dataset_name: str, run_name: str = "", **kwargs + cls: Type[DatasetType], dataset_name: str, language: str, run_name: str = "", **kwargs ) -> DatasetType: """Generates dataset instance from ID. dataset_name (str): Dataset name. run_name (str): Run name. + language (str): Language. RETURNS (DatasetType): Instance of dataset with type determined by dataset ID. """ @@ -474,7 +405,7 @@ def generate_from_id( len(classes) == 1 ), f"Module {module_name} should contain exactly one Dataset class definition." - return classes[0][1](run_name=run_name, **kwargs) + return classes[0][1](run_name=run_name, language=language, **kwargs) def clean_assets(self) -> None: """Cleans assets, i.e. removes/changes errors in the external datasets that cannot easily be cleaned diff --git a/benchmarks/nel/scripts/datasets/mewsli_9.py b/benchmarks/nel/scripts/datasets/mewsli_9.py index d01b00702..1623f73c8 100644 --- a/benchmarks/nel/scripts/datasets/mewsli_9.py +++ b/benchmarks/nel/scripts/datasets/mewsli_9.py @@ -8,7 +8,7 @@ from datasets.dataset import Dataset from datasets.utils import fetch_entity_information, create_spans_from_doc_annotation -from schemas import Entity, Annotation +from wikid import schemas class Mewsli9Dataset(Dataset): @@ -18,11 +18,11 @@ class Mewsli9Dataset(Dataset): def name(self) -> str: return "mewsli_9" - def _parse_external_corpus( + def _parse_corpus( self, **kwargs - ) -> Tuple[Dict[str, Entity], Set[str], Dict[str, List[Annotation]]]: + ) -> Tuple[Dict[str, schemas.Entity], Set[str], Dict[str, List[schemas.Annotation]]]: entity_qids: Set[str] = set() - annotations: Dict[str, List[Annotation]] = {} + annotations: Dict[str, List[schemas.Annotation]] = {} with open( self._paths["assets"] / "clean" / "en" / "mentions.tsv", encoding="utf-8" @@ -34,7 +34,7 @@ def _parse_external_corpus( if row["docid"] not in annotations: annotations[row["docid"]] = [] annotations[row["docid"]].append( - Annotation( + schemas.Annotation( entity_name=row["url"].split("/")[-1].replace("_", " "), entity_id=row["qid"], start_pos=int(row["position"]), @@ -42,7 +42,7 @@ def _parse_external_corpus( ) ) - entities, failed_entity_lookups, _ = fetch_entity_information(tuple(entity_qids)) + entities, failed_entity_lookups, _ = fetch_entity_information(tuple(entity_qids), self._language) return entities, failed_entity_lookups, annotations diff --git a/benchmarks/nel/scripts/datasets/utils.py b/benchmarks/nel/scripts/datasets/utils.py index 5605d76cf..5d7d4688a 100644 --- a/benchmarks/nel/scripts/datasets/utils.py +++ 
b/benchmarks/nel/scripts/datasets/utils.py @@ -1,10 +1,9 @@ """ Utilities for NEL benchmark. """ -from typing import Dict, List, Set, Tuple, Iterable +from typing import Dict, List, Set, Tuple import tqdm from spacy.tokens import Token, Span, Doc -from schemas import Entity, Annotation -from wikid import wiki_dump_api +from wikid import schemas, load_entities def _does_token_overlap_with_annotation( @@ -25,11 +24,13 @@ def _does_token_overlap_with_annotation( def fetch_entity_information( values: Tuple[str, ...], - batch_size: int = 1000, -) -> Tuple[Dict[str, Entity], Set[str], Dict[str, str]]: + language: str, + batch_size: int = 5000, +) -> Tuple[Dict[str, schemas.Entity], Set[str], Dict[str, str]]: """ Fetches information on entities from database. values (Tuple[str]): Values for key to look up. + language (str): Language. db_conn (sqlite3.Connection): Database connection. batch_size (int): Number of entity titles to resolve in the same API request. Between 1 and 50. RETURNS (Tuple[Dict[str, Entity], Set[str], Dict[str, str]]): Updated entities, failed lookups, mappings of titles @@ -41,11 +42,11 @@ def fetch_entity_information( pbar = tqdm.tqdm(total=len(values)) failed_lookups: Set[str] = set() name_qid_map: Dict[str, str] = {} - entities: Dict[str, Entity] = {} + entities: Dict[str, schemas.Entity] = {} for i in range(0, len(values), batch_size): chunk = tuple([v.replace("_", " ") for v in values[i : i + batch_size]]) - entities_chunk = wiki_dump_api.load_entities(chunk) + entities_chunk = load_entities(language, chunk) _failed_lookups = set(chunk) # Replace entity titles keys in dict with Wikidata QIDs. Add entity description. @@ -64,10 +65,10 @@ def fetch_entity_information( def create_spans_from_doc_annotation( doc: Doc, - entities_info: Dict[str, Entity], - annotations: List[Annotation], + entities_info: Dict[str, schemas.Entity], + annotations: List[schemas.Annotation], harmonize_with_doc_ents: bool, -) -> Tuple[List[Span], List[Annotation]]: +) -> Tuple[List[Span], List[schemas.Annotation]]: """Creates spans from annotations for one document. doc (Doc): Document for whom to create spans. entities_info (Dict[str, Entity]): All available entities. @@ -83,8 +84,8 @@ def create_spans_from_doc_annotation( (ent.start_char + (0 if not ent.text.lower().startswith("the ") else 4), ent.end_char) for ent in doc.ents } - doc_annots: List[Annotation] = [] - overlapping_doc_annotations: List[Annotation] = [] + doc_annots: List[schemas.Annotation] = [] + overlapping_doc_annotations: List[schemas.Annotation] = [] if harmonize_with_doc_ents and len(doc_ents_idx) == 0: return [], [] diff --git a/benchmarks/nel/scripts/evaluate.py b/benchmarks/nel/scripts/evaluate.py index e56b4c2da..3dc1bf242 100644 --- a/benchmarks/nel/scripts/evaluate.py +++ b/benchmarks/nel/scripts/evaluate.py @@ -4,14 +4,14 @@ from custom_functions import create_candidates_via_embeddings -def main(dataset_name: str, run_name: str): +def main(dataset_name: str, run_name: str, language: str): """Evaluate the trained EL component by applying it to unseen text. dataset_name (str): Name of dataset to evaluate on. run_name (str): Run name. + language (str): Language. 
""" - Dataset.generate_from_id(dataset_name, run_name).evaluate(run_name=run_name) + Dataset.generate_from_id(dataset_name, language, run_name).evaluate(run_name=run_name) if __name__ == "__main__": - main("mewsli_9", "cg-embedding") - # typer.run(main) + typer.run(main) diff --git a/benchmarks/nel/scripts/parse_corpus.py b/benchmarks/nel/scripts/parse_corpus.py new file mode 100644 index 000000000..9e433cd5a --- /dev/null +++ b/benchmarks/nel/scripts/parse_corpus.py @@ -0,0 +1,16 @@ +""" Parse corpus. """ +from datasets.dataset import Dataset +import typer + + +def main(dataset_name: str, run_name: str, language: str): + """Parse corpus. + dataset_name (str): Name of dataset to evaluate on. + run_name (str): Run name. + language (str): Language. + """ + Dataset.generate_from_id(dataset_name, language, run_name).parse_corpus(run_name=run_name) + + +if __name__ == "__main__": + typer.run(main) diff --git a/benchmarks/nel/scripts/schemas.py b/benchmarks/nel/scripts/schemas.py deleted file mode 100644 index 52ae4f4ef..000000000 --- a/benchmarks/nel/scripts/schemas.py +++ /dev/null @@ -1,28 +0,0 @@ -""" Schemas for types used in this project. """ - -from typing import Set, Optional - -from pydantic.fields import Field -from pydantic.main import BaseModel -from pydantic.types import StrictInt - - -class Entity(BaseModel): - """Schema for single entity.""" - - qid: str = Field(..., title="Wiki QID.") - name: str = Field(..., title="Entity name.") - aliases: Set[str] = Field(..., title="All found aliases.") - count: StrictInt = Field(0, title="Count in Wiki corpus.") - description: Optional[str] = Field(None, title="Full description.") - article_title: Optional[str] = Field(None, title="Article title.") - article_text: Optional[str] = Field(None, title="Article text.") - - -class Annotation(BaseModel): - """Schema for single annotation.""" - - entity_name: str = Field(..., title="Entity name.") - entity_id: Optional[str] = Field(None, title="Entity ID.") - start_pos: StrictInt = Field(..., title="Start character position.") - end_pos: StrictInt = Field(..., title="End character position.") diff --git a/benchmarks/nel/scripts/train.sh b/benchmarks/nel/scripts/train.sh index b9208d6df..6a615ecab 100644 --- a/benchmarks/nel/scripts/train.sh +++ b/benchmarks/nel/scripts/train.sh @@ -1,20 +1,22 @@ #!/bin/bash -gpu_id="${5:--1}" +gpu_id="${6:--1}" # Runs training. Expects as arguments: # (1) dataset ID, # (2) run name, -# (3) config file name, -# (4) max. steps. -# (5) GPU information if GPU is to be used. -PYTHONPATH=scripts python -m spacy train configs/$3 \ +# (3) language, +# (4) config file name, +# (5) max. steps. +# (6) GPU information if GPU is to be used. 
+PYTHONPATH=scripts python -m spacy train configs/$4 \ --paths.dataset_name $1 \ --output training/$1/$2 \ --paths.train corpora/$1/train.spacy \ --paths.dev corpora/$1/dev.spacy \ - --paths.kb temp/$1/kb \ - --paths.base_nlp temp/$1/nlp \ - --training.max_steps $4 \ + --paths.kb wikid/output/$3/kb \ + --paths.base_nlp wikid/output/$3/nlp \ + --paths.language $3 \ + --training.max_steps $5 \ -c scripts/custom_functions.py \ --gpu-id $gpu_id \ No newline at end of file diff --git a/benchmarks/nel/scripts/utils.py b/benchmarks/nel/scripts/utils.py index 5864c4df9..872c83094 100644 --- a/benchmarks/nel/scripts/utils.py +++ b/benchmarks/nel/scripts/utils.py @@ -18,11 +18,3 @@ def get_logger(handle: str) -> logging.Logger: """ return logging.getLogger(handle) - - -def read_filter_terms() -> Set[str]: - """Read terms used to filter Wiki dumps/corpora. - RETURNS (Set[str]): Set of filter terms. - """ - with open(Path(__file__).parent.parent / "configs" / "filter_terms.txt", "r") as file: - return {ft.replace("\n", "") for ft in file.readlines()} diff --git a/benchmarks/nel/temp/.gitignore b/benchmarks/nel/temp/.gitignore deleted file mode 100644 index e7a210ec7..000000000 --- a/benchmarks/nel/temp/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -* -*/ -!.gitignore \ No newline at end of file diff --git a/benchmarks/nel/test_nel_benchmark.py b/benchmarks/nel/test_nel_benchmark.py index e64d9e0c9..a875e044c 100644 --- a/benchmarks/nel/test_nel_benchmark.py +++ b/benchmarks/nel/test_nel_benchmark.py @@ -12,11 +12,12 @@ def test_nel_benchmark(): project_assets(root) project_run(root, "download_mewsli9", capture=True) project_run(root, "download_model", capture=True) - project_run(root, "wiki_clone_wikid", capture=True) + project_run(root, "wikid_clone", capture=True) project_run(root, "preprocess", capture=True) - project_run(root, "wiki_download_assets", capture=True) - project_run(root, "wiki_parse", capture=True) - project_run(root, "create_kb", capture=True) + project_run(root, "wikid_download_assets", capture=True) + project_run(root, "wikid_parse", capture=True) + project_run(root, "wikid_create_kb", capture=True) + project_run(root, "parse_corpus", capture=True) project_run(root, "compile_corpora", capture=True) project_run(root, "train", capture=True, overrides={"vars.training_max_steps": 1}) project_run(root, "evaluate", capture=True) diff --git a/pipelines/tagger_parser_ud/configs/default.cfg b/pipelines/tagger_parser_ud/configs/default.cfg index 43528eafd..17bc80df8 100644 --- a/pipelines/tagger_parser_ud/configs/default.cfg +++ b/pipelines/tagger_parser_ud/configs/default.cfg @@ -159,7 +159,7 @@ compound = 1.001 t = 0.0 [training.logger] -@loggers = "spacy.ClearMLLogger.v1" +@loggers = "spacy.ConsoleLogger.v1" project_name = "test_project" [training.optimizer] From 0dbc93465984637fa73156fb535e6ff25fc7322e Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 26 Oct 2022 15:16:26 +0200 Subject: [PATCH 10/38] Fix parameter passing to parse_corpus. 
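project.yml was still handing the vectors model to parse_corpus.py, whose CLI expects only the dataset name and the language; both sides now agree. Roughly, the corrected call chain looks like this (argument values illustrative, see the diff below):

    # env PYTHONPATH=. python ./scripts/parse_corpus.py mewsli_9 en
    from datasets.dataset import Dataset

    # The run name is irrelevant for corpus parsing, so an empty string is passed.
    Dataset.generate_from_id("mewsli_9", "en", "").parse_corpus()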
--- benchmarks/nel/project.yml | 5 ++--- benchmarks/nel/scripts/candidate_generation/lexical.py | 2 -- benchmarks/nel/scripts/parse_corpus.py | 5 ++--- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 8b3917e9c..680659dcc 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -24,6 +24,7 @@ workflows: - wikid_download_assets - wikid_parse - wikid_create_kb + - parse_corpus - compile_corpora - train - evaluate @@ -85,12 +86,10 @@ commands: - "wikid/output/${vars.language}/kb" - "wikid/output/${vars.language}/nlp" - # todo generate annotations - this was done in create_kb in the past. would make sense in compile_corpora? - - name: parse_corpus help: "Parse corpus to generate entity and annotation lookups used for corpora compilation." script: - - "env PYTHONPATH=. python ./scripts/parse_corpus.py ${vars.dataset} ${vars.language} ${vars.vectors_model}" + - "env PYTHONPATH=. python ./scripts/parse_corpus.py ${vars.dataset} ${vars.language}" deps: - "assets/${vars.dataset}/clean" - "wikid/output/${vars.language}/wiki.sqlite3" diff --git a/benchmarks/nel/scripts/candidate_generation/lexical.py b/benchmarks/nel/scripts/candidate_generation/lexical.py index 6c8ae8c64..63353a959 100644 --- a/benchmarks/nel/scripts/candidate_generation/lexical.py +++ b/benchmarks/nel/scripts/candidate_generation/lexical.py @@ -21,8 +21,6 @@ def _fetch_candidates( max_n_candidates: int, similarity_cutoff: float = 0.5, ) -> Iterable[int]: - # todo 1. replace all wiki parsing stuff with wikid - # todo 2. review generation of KB, move to wikid # todo 3. get rid of entity pickle files (move loading, stats to compile_corpora) # todo 4. re-evaluate efficacy of fuzzy string lookup (memory? access time?) # todo also: push forward spacy NEL changes - add mechanism for pushing back entity sets instead of single diff --git a/benchmarks/nel/scripts/parse_corpus.py b/benchmarks/nel/scripts/parse_corpus.py index 9e433cd5a..1237a6b59 100644 --- a/benchmarks/nel/scripts/parse_corpus.py +++ b/benchmarks/nel/scripts/parse_corpus.py @@ -3,13 +3,12 @@ import typer -def main(dataset_name: str, run_name: str, language: str): +def main(dataset_name: str, language: str): """Parse corpus. dataset_name (str): Name of dataset to evaluate on. - run_name (str): Run name. language (str): Language. """ - Dataset.generate_from_id(dataset_name, language, run_name).parse_corpus(run_name=run_name) + Dataset.generate_from_id(dataset_name, language, "").parse_corpus() if __name__ == "__main__": From 6d96f20398fd8cce3e9a16b4b44c036cb60f3cab Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 26 Oct 2022 15:22:40 +0200 Subject: [PATCH 11/38] Revert unrelated test changes to UD parser config. --- pipelines/tagger_parser_ud/configs/default.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/tagger_parser_ud/configs/default.cfg b/pipelines/tagger_parser_ud/configs/default.cfg index 17bc80df8..964acd530 100644 --- a/pipelines/tagger_parser_ud/configs/default.cfg +++ b/pipelines/tagger_parser_ud/configs/default.cfg @@ -160,7 +160,7 @@ t = 0.0 [training.logger] @loggers = "spacy.ConsoleLogger.v1" -project_name = "test_project" +progress_bar = false [training.optimizer] @optimizers = "Adam.v1" From f0cb64a1b981b3f058ebd227565b4b7b6bd7a749 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 26 Oct 2022 15:23:49 +0200 Subject: [PATCH 12/38] Add comment. 
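The added comment records that variable interpolation cannot be used for the lang entry in the [nlp] block of this config, so the language stays hard-coded to "en" there. If it ever needs to be parameterized, one possible workaround — shown only as a sketch, not something this patch introduces — is to apply the value as a config override at load time rather than via interpolation; the same dotted override can also be passed on the spacy train command line.

    from spacy import util

    # Config path and override value are illustrative.
    config = util.load_config("configs/nel.cfg", overrides={"nlp.lang": "en"})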
--- benchmarks/nel/configs/nel.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index 5fb7bd579..4532dda35 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -16,6 +16,7 @@ gpu_allocator = null [nlp] +# Passing "${paths.language}" doesn't work here, unfortunately. lang = "en" pipeline = ["senter","ner","entity_linker"] disabled = [] From e166144ab6184280214ebb9f6caf147211a70866 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 26 Oct 2022 15:27:12 +0200 Subject: [PATCH 13/38] Remove assets command from test. --- benchmarks/nel/test_nel_benchmark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/nel/test_nel_benchmark.py b/benchmarks/nel/test_nel_benchmark.py index a875e044c..4491c8418 100644 --- a/benchmarks/nel/test_nel_benchmark.py +++ b/benchmarks/nel/test_nel_benchmark.py @@ -9,7 +9,6 @@ @pytest.mark.skipif(sys.platform == "win32", reason="Skipping on Windows (for now) due to platform-specific scripts.") def test_nel_benchmark(): root = Path(__file__).parent - project_assets(root) project_run(root, "download_mewsli9", capture=True) project_run(root, "download_model", capture=True) project_run(root, "wikid_clone", capture=True) From dc7977b62d0fa39ee8848d8b228a0bbad7605464 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 26 Oct 2022 15:27:21 +0200 Subject: [PATCH 14/38] Remove assets command from test. --- benchmarks/nel/test_nel_benchmark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/nel/test_nel_benchmark.py b/benchmarks/nel/test_nel_benchmark.py index 4491c8418..2e22a5f31 100644 --- a/benchmarks/nel/test_nel_benchmark.py +++ b/benchmarks/nel/test_nel_benchmark.py @@ -3,7 +3,6 @@ from pathlib import Path import sys from spacy.cli.project.run import project_run -from spacy.cli.project.assets import project_assets @pytest.mark.skipif(sys.platform == "win32", reason="Skipping on Windows (for now) due to platform-specific scripts.") From e6fb7fd16fb67d3bab34fe9cee744a6788695df5 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 26 Oct 2022 15:51:27 +0200 Subject: [PATCH 15/38] Change description for 'preprocess' step. --- benchmarks/nel/project.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 680659dcc..682de8f1a 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -54,7 +54,7 @@ commands: - pip install -r wikid/requirements.txt - name: preprocess - help: Preprocess test datasets. + help: Preprocess and clean corpus data. script: - "env PYTHONPATH=. python ./scripts/clean_data.py ${vars.dataset} ${vars.language}" deps: From dc2e2892054c825173bef86cee89d6e691e28539 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 26 Oct 2022 16:25:54 +0200 Subject: [PATCH 16/38] Fix configuration error. --- benchmarks/nel/scripts/candidate_generation/lexical.py | 8 +------- benchmarks/nel/scripts/custom_functions.py | 8 ++++++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/benchmarks/nel/scripts/candidate_generation/lexical.py b/benchmarks/nel/scripts/candidate_generation/lexical.py index 63353a959..3c5c40155 100644 --- a/benchmarks/nel/scripts/candidate_generation/lexical.py +++ b/benchmarks/nel/scripts/candidate_generation/lexical.py @@ -21,15 +21,9 @@ def _fetch_candidates( max_n_candidates: int, similarity_cutoff: float = 0.5, ) -> Iterable[int]: - # todo 3. 
get rid of entity pickle files (move loading, stats to compile_corpora) - # todo 4. re-evaluate efficacy of fuzzy string lookup (memory? access time?) - # todo also: push forward spacy NEL changes - add mechanism for pushing back entity sets instead of single - # entities - how? - - hits = self._lookup_struct.get(span.text, []) all_cands = [ kb.get_alias_candidates(entry[1]) for entry in self._lookup_struct.get(span.text, []) if entry[0] >= similarity_cutoff ][:max_n_candidates] - x = 3 + return {cand for cands_for_alias in all_cands for cand in cands_for_alias} diff --git a/benchmarks/nel/scripts/custom_functions.py b/benchmarks/nel/scripts/custom_functions.py index d851a137e..2ecb0e753 100644 --- a/benchmarks/nel/scripts/custom_functions.py +++ b/benchmarks/nel/scripts/custom_functions.py @@ -16,10 +16,11 @@ @spacy.registry.misc("EmbeddingGetCandidates.v1") def create_candidates_via_embeddings( - dataset_name: str, max_n_candidates: int, lexical_similarity_cutoff: float + dataset_name: str, language: str, max_n_candidates: int, lexical_similarity_cutoff: float ) -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: """Returns Callable for identification of candidates via their embeddings. dataset_name (str): Dataset name. + langugage (str): Language. max_n_candidates (int): Numbers of nearest neighbours to query. RETURNS (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Callable for identification of entity candidates. """ @@ -30,6 +31,7 @@ def create_candidates_via_embeddings( partial( embedding_candidate_selector, dataset_id=dataset_name, + language=language, max_n_candidates=max_n_candidates, lexical_similarity_cutoff=lexical_similarity_cutoff, ), @@ -38,10 +40,11 @@ def create_candidates_via_embeddings( @spacy.registry.misc("FuzzyStringGetCandidates.v1") def create_candidates_via_fuzzy_string_matching( - dataset_name: str, max_n_candidates: int, similarity_cutoff: float + dataset_name: str, language: str, max_n_candidates: int, similarity_cutoff: float ) -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: """Returns Callable for identification of candidates via NN search in lexical space. dataset_name (str): Dataset name. + langugage (str): Language. max_n_candidates (int): Numbers of nearest neighbours to query. similarity_cutoff (float): Similarity value below which candidates won't be included. RETURNS (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Callable for identification of entity candidates. @@ -55,6 +58,7 @@ def create_candidates_via_fuzzy_string_matching( partial( fuzzy_lexical_candidate_selector, dataset_id=dataset_name, + language=language, max_n_candidates=max_n_candidates, similarity_cutoff=similarity_cutoff, ), From 5989cc17ec73081aea7cb129f1c72ba0a71e6cd6 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 26 Oct 2022 16:46:24 +0200 Subject: [PATCH 17/38] Remove max_epochs limit. --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a73de52ed..0afe2abef 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -7,7 +7,7 @@ trigger: variables: # Make sure we're exiting training as early as possible - SPACY_CONFIG_OVERRIDES: '--training.max_epochs=1 --training.max_steps=1' + SPACY_CONFIG_OVERRIDES: '--training.max_steps=1' WASABI_LOG_FRIENDLY: 1 jobs: From 5e5e1fd6cc8da7b585342dfc3307ce7074a17517 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 26 Oct 2022 17:07:32 +0200 Subject: [PATCH 18/38] Remove comment in nel.cfg. 
Reintroduce epochs override in CI. --- azure-pipelines.yml | 2 +- benchmarks/nel/configs/nel.cfg | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0afe2abef..a73de52ed 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -7,7 +7,7 @@ trigger: variables: # Make sure we're exiting training as early as possible - SPACY_CONFIG_OVERRIDES: '--training.max_steps=1' + SPACY_CONFIG_OVERRIDES: '--training.max_epochs=1 --training.max_steps=1' WASABI_LOG_FRIENDLY: 1 jobs: diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index 4532dda35..5fb7bd579 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -16,7 +16,6 @@ gpu_allocator = null [nlp] -# Passing "${paths.language}" doesn't work here, unfortunately. lang = "en" pipeline = ["senter","ner","entity_linker"] disabled = [] From 32ab441a079242cffc133d3bae3ee9efc217437b Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 27 Oct 2022 10:38:01 +0200 Subject: [PATCH 19/38] Disable overrides temporarily for wikid calls. --- benchmarks/nel/test_nel_benchmark.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/nel/test_nel_benchmark.py b/benchmarks/nel/test_nel_benchmark.py index 2e22a5f31..02f21b60a 100644 --- a/benchmarks/nel/test_nel_benchmark.py +++ b/benchmarks/nel/test_nel_benchmark.py @@ -1,4 +1,6 @@ """ Testing all project steps. """ +import os + import pytest from pathlib import Path import sys @@ -12,11 +14,15 @@ def test_nel_benchmark(): project_run(root, "download_model", capture=True) project_run(root, "wikid_clone", capture=True) project_run(root, "preprocess", capture=True) + overrides_key = "SPACY_CONFIG_OVERRIDES" + overrides = os.environ.pop(overrides_key) if overrides_key in os.environ else None project_run(root, "wikid_download_assets", capture=True) project_run(root, "wikid_parse", capture=True) project_run(root, "wikid_create_kb", capture=True) + if overrides: + os.environ[overrides_key] = overrides project_run(root, "parse_corpus", capture=True) project_run(root, "compile_corpora", capture=True) - project_run(root, "train", capture=True, overrides={"vars.training_max_steps": 1}) + project_run(root, "train", capture=True) project_run(root, "evaluate", capture=True) project_run(root, "compare_evaluations", capture=True) From aaa40b759dffd30c0e955b729930be4b2bb7da01 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 27 Oct 2022 11:05:49 +0200 Subject: [PATCH 20/38] Clean up test. 
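The cleanup below keeps the explicit pop-and-restore of SPACY_CONFIG_OVERRIDES around the wikid sub-project calls, since the CI override string can trigger config validation errors in this project-in-project situation. A minimal sketch of the same idea as a reusable context manager (an illustration of the pattern, not what the test actually uses):

    import os
    from contextlib import contextmanager

    @contextmanager
    def suppressed_env_var(key: str):
        """Temporarily unset an environment variable and restore it afterwards."""
        value = os.environ.pop(key, None)
        try:
            yield
        finally:
            if value is not None:
                os.environ[key] = value

    # Usage: with suppressed_env_var("SPACY_CONFIG_OVERRIDES"): project_run(root, "wikid_parse", capture=True)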
--- benchmarks/nel/test_nel_benchmark.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/nel/test_nel_benchmark.py b/benchmarks/nel/test_nel_benchmark.py index 02f21b60a..b730df211 100644 --- a/benchmarks/nel/test_nel_benchmark.py +++ b/benchmarks/nel/test_nel_benchmark.py @@ -9,16 +9,20 @@ @pytest.mark.skipif(sys.platform == "win32", reason="Skipping on Windows (for now) due to platform-specific scripts.") def test_nel_benchmark(): + overrides_key = "SPACY_CONFIG_OVERRIDES" root = Path(__file__).parent + project_run(root, "download_mewsli9", capture=True) project_run(root, "download_model", capture=True) project_run(root, "wikid_clone", capture=True) project_run(root, "preprocess", capture=True) - overrides_key = "SPACY_CONFIG_OVERRIDES" - overrides = os.environ.pop(overrides_key) if overrides_key in os.environ else None + # Temporarily disable override env variables, since these may result in config validation errors in this + # project-in-project situation. + overrides = os.environ.pop(overrides_key, None) project_run(root, "wikid_download_assets", capture=True) project_run(root, "wikid_parse", capture=True) project_run(root, "wikid_create_kb", capture=True) + # Re-enable config overrides, if set before. if overrides: os.environ[overrides_key] = overrides project_run(root, "parse_corpus", capture=True) From 8ff709514d77895fe03dd7d88368bc6e2646d382 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 27 Oct 2022 11:21:04 +0200 Subject: [PATCH 21/38] Readd overrides in test. --- benchmarks/nel/test_nel_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/nel/test_nel_benchmark.py b/benchmarks/nel/test_nel_benchmark.py index b730df211..54b26d217 100644 --- a/benchmarks/nel/test_nel_benchmark.py +++ b/benchmarks/nel/test_nel_benchmark.py @@ -27,6 +27,6 @@ def test_nel_benchmark(): os.environ[overrides_key] = overrides project_run(root, "parse_corpus", capture=True) project_run(root, "compile_corpora", capture=True) - project_run(root, "train", capture=True) + project_run(root, "train", capture=True, overrides={"vars.training_max_steps": 1, "vars.training_max_epochs": 1}) project_run(root, "evaluate", capture=True) project_run(root, "compare_evaluations", capture=True) From df678800b9df69676f34cf102bdb8d7e98229010 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 27 Oct 2022 12:09:23 +0200 Subject: [PATCH 22/38] Set default CG to reduce test time. --- benchmarks/nel/configs/nel.cfg | 8 +------- benchmarks/nel/project.yml | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index 5fb7bd579..5d69df4d7 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -40,13 +40,7 @@ entity_vector_length = 64 incl_context = true incl_prior = true labels_discard = [] - -[components.entity_linker.get_candidates] -@misc = "FuzzyStringGetCandidates.v1" -dataset_name = "${paths.dataset_name}" -language = "${paths.language}" -max_n_candidates = 50 -similarity_cutoff = 0.5 +get_candidates = {"@misc": "spacy.CandidateGenerator.v1} [components.entity_linker.model] @architectures = "spacy.EntityLinker.v1" diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 682de8f1a..561b1ea9e 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -1,7 +1,7 @@ title: 'NEL Benchmark' description: "Pipeline for benchmarking NEL approaches (incl. candidate generation and entity disambiguation)." 
vars: - run: "cg-lexical" + run: "cg-default" language: "en" config: "nel.cfg" vectors_model: "en_core_web_lg" From 8e1bdc942be95525793f65e3d38707989e852006 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 27 Oct 2022 13:00:15 +0200 Subject: [PATCH 23/38] Fix typo. --- benchmarks/nel/configs/nel.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index 5d69df4d7..ffc612369 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -40,7 +40,7 @@ entity_vector_length = 64 incl_context = true incl_prior = true labels_discard = [] -get_candidates = {"@misc": "spacy.CandidateGenerator.v1} +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} [components.entity_linker.model] @architectures = "spacy.EntityLinker.v1" From 40f66eb39738ad13ca7b8f7f478d3b0aaf432424 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 28 Oct 2022 12:43:02 +0200 Subject: [PATCH 24/38] Update wikid repo URL. --- benchmarks/nel/project.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 561b1ea9e..082e9eb35 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -49,8 +49,7 @@ commands: - name: wikid_clone help: Clone `wikid` to prepare Wiki database and `KnowledgeBase`. script: - # To be updated after merge. - - git clone https://github.com/rmitsch/wikid.git --branch feature/wiki-parsing + - git clone https://github.com/explosion/wikid.git --branch main - pip install -r wikid/requirements.txt - name: preprocess From 2ed7900a03fa41e19da4d9f1e55d1c25be01ab0b Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 28 Oct 2022 13:06:29 +0200 Subject: [PATCH 25/38] Remove test code. --- benchmarks/nel/scripts/datasets/dataset.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/scripts/datasets/dataset.py index e953a17e8..9a139c5e7 100644 --- a/benchmarks/nel/scripts/datasets/dataset.py +++ b/benchmarks/nel/scripts/datasets/dataset.py @@ -282,16 +282,8 @@ def evaluate(self, run_name: str) -> None: ent_offset = (ent.start_char, ent.end_char) # For the candidate generation evaluation also mis-aligned entities are considered. label = ent_pred_labels.get(ent_offset, "NIL") - a_cands = ent_cands.get(ent_offset, {}) - a_cands_aliases = {v.alias_ for c, v in ent_cands.get(ent_offset, {}).items()} - a_cands_kb_ids = {v.entity_ for c, v in ent_cands.get(ent_offset, {}).items()} - a_kb_id = ent.kb_id_ - a_ent = ent - a_mention = example.reference.text[ent.start_char:ent.end_char] cand_gen_label_counts[label] += 1 candidate_results.update_metrics(label, ent.kb_id_, set(ent_cands.get(ent_offset, {}).keys())) - if len(a_cands_kb_ids) > 0 and ent.kb_id_ not in a_cands_kb_ids: - x = 3 # Update entity disambiguation stats for baselines. evaluation.add_disambiguation_baseline( From 2051342af201cf86de9119e59930d77e0f713553 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 21 Nov 2022 17:17:08 +0100 Subject: [PATCH 26/38] Start transition towards WikiKB. 
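This commit drops the bespoke candidate-generation modules and the entity pickle files in favour of wikid-backed lookups, and renames the corpus scripts to cli_*. Pieced together from the calls visible in the diff (exact signatures shift again in later commits, so treat this as a sketch rather than the final API), the per-dataset flow now looks roughly like:

    from datasets.dataset import Dataset

    dataset = Dataset.generate_from_id("mewsli_9", "en", "")    # run name is irrelevant for these steps
    dataset.extract_annotations()                 # writes assets/mewsli_9/annotations.pkl
    dataset.compile_corpora(filter_terms=None)    # writes corpora/mewsli_9/{train,dev,test}.spacy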
--- benchmarks/nel/project.yml | 32 ++++----- .../scripts/candidate_generation/__init__.py | 0 .../nel/scripts/candidate_generation/base.py | 70 ------------------- .../candidate_generation/embeddings.py | 53 -------------- .../scripts/candidate_generation/lexical.py | 29 -------- .../{clean_data.py => cli_clean_data.py} | 0 ...luations.py => cli_compare_evaluations.py} | 0 ...pile_corpora.py => cli_compile_corpora.py} | 0 .../scripts/{evaluate.py => cli_evaluate.py} | 3 +- ...e_corpus.py => cli_extract_annotations.py} | 2 +- benchmarks/nel/scripts/custom_functions.py | 65 ----------------- benchmarks/nel/scripts/datasets/dataset.py | 53 ++++---------- benchmarks/nel/scripts/datasets/mewsli_9.py | 19 ++--- benchmarks/nel/test_nel_benchmark.py | 3 +- 14 files changed, 44 insertions(+), 285 deletions(-) delete mode 100644 benchmarks/nel/scripts/candidate_generation/__init__.py delete mode 100644 benchmarks/nel/scripts/candidate_generation/base.py delete mode 100644 benchmarks/nel/scripts/candidate_generation/embeddings.py delete mode 100644 benchmarks/nel/scripts/candidate_generation/lexical.py rename benchmarks/nel/scripts/{clean_data.py => cli_clean_data.py} (100%) rename benchmarks/nel/scripts/{compare_evaluations.py => cli_compare_evaluations.py} (100%) rename benchmarks/nel/scripts/{compile_corpora.py => cli_compile_corpora.py} (100%) rename benchmarks/nel/scripts/{evaluate.py => cli_evaluate.py} (89%) rename benchmarks/nel/scripts/{parse_corpus.py => cli_extract_annotations.py} (78%) delete mode 100644 benchmarks/nel/scripts/custom_functions.py diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 082e9eb35..0dcd87b11 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -15,6 +15,8 @@ vars: directories: ["assets", "training", "configs", "scripts", "corpora", "evaluation"] +check_requirements: True + workflows: all: - download_mewsli9 @@ -24,7 +26,7 @@ workflows: - wikid_download_assets - wikid_parse - wikid_create_kb - - parse_corpus + - extract_annotations - compile_corpora - train - evaluate @@ -47,15 +49,15 @@ commands: - "python -m spacy download ${vars.vectors_model}" - name: wikid_clone - help: Clone `wikid` to prepare Wiki database and `KnowledgeBase`. + help: "Clone `wikid` to prepare Wiki database and `KnowledgeBase`." script: - - git clone https://github.com/explosion/wikid.git --branch main - - pip install -r wikid/requirements.txt + - "git clone https://github.com/rmitsch/wikid.git --branch feature/kb + - "pip install -r wikid/requirements.txt" - name: preprocess help: Preprocess and clean corpus data. script: - - "env PYTHONPATH=. python ./scripts/clean_data.py ${vars.dataset} ${vars.language}" + - "env PYTHONPATH=. python ./scripts/cli_clean_data.py ${vars.dataset} ${vars.language}" deps: - "assets/${vars.dataset}/raw" outputs: @@ -71,39 +73,35 @@ commands: - name: wikid_parse help: "Parse Wikipedia dumps. This can take a long time if you're not using the filtered dumps!" script: - - "spacy project run parse wikid --vars.language ${vars.language} --vars.filter True" + - "spacy project run parse wikid --vars.language ${vars.language} --vars.filter ${vars.filter}" outputs: - "wikid/output/${vars.language}/wiki.sqlite3" - name: wikid_create_kb help: "Create the knowledge base and write it to file." 
script: - - "spacy project run create_kb wikid --vars.language ${vars.language} --vars.vectors_model ${vars.vectors_model}" + - "spacy project run create_kb wikid --vars.language ${vars.language} --vars.vectors_model ${vars.vectors_model} --force" deps: - "wikid/output/${vars.language}/wiki.sqlite3" outputs_no_cache: - "wikid/output/${vars.language}/kb" - "wikid/output/${vars.language}/nlp" - - name: parse_corpus - help: "Parse corpus to generate entity and annotation lookups used for corpora compilation." + - name: extract_annotations + help: "Extract annotations from corpus." script: - - "env PYTHONPATH=. python ./scripts/parse_corpus.py ${vars.dataset} ${vars.language}" + - "env PYTHONPATH=. python ./scripts/cli_extract_annotations.py ${vars.dataset} ${vars.language}" deps: - "assets/${vars.dataset}/clean" - "wikid/output/${vars.language}/wiki.sqlite3" outputs: - - "assets/${vars.dataset}/entities.pkl" - - "assets/${vars.dataset}/entities_failed_lookup.pkl" - "assets/${vars.dataset}/annotations.pkl" - name: compile_corpora help: "Compile corpora, separated in train/dev/test sets." script: - - "env PYTHONPATH=. python ./scripts/compile_corpora.py ${vars.dataset} ${vars.language} ${vars.filter}" + - "env PYTHONPATH=. python ./scripts/cli_compile_corpora.py ${vars.dataset} ${vars.language} ${vars.filter}" deps: - - "assets/${vars.dataset}/entities.pkl" - - "assets/${vars.dataset}/entities_failed_lookups.pkl" - "assets/${vars.dataset}/annotations.pkl" - "wikid/output/${vars.language}/kb" - "wikid/output/${vars.language}/nlp" @@ -128,7 +126,7 @@ commands: - name: evaluate help: "Evaluate on the test set." script: - - "env PYTHONPATH=. python ./scripts/evaluate.py ${vars.dataset} '${vars.run}' ${vars.language}" + - "env PYTHONPATH=. python ./scripts/cli_evaluate.py ${vars.dataset} '${vars.run}' ${vars.language}" deps: - "training/${vars.dataset}/${vars.run}/model-best" - "wikid/output/${vars.language}/nlp" @@ -139,7 +137,7 @@ commands: - name: compare_evaluations help: "Compare available set of evaluation runs." script: - - "env PYTHONPATH=. python ./scripts/compare_evaluations.py ${vars.dataset} ${vars.language} --highlight-criterion ${vars.eval_highlight_metric}" + - "env PYTHONPATH=. python ./scripts/cli_compare_evaluations.py ${vars.dataset} ${vars.language} --highlight-criterion ${vars.eval_highlight_metric}" deps: - "evaluation/${vars.dataset}" diff --git a/benchmarks/nel/scripts/candidate_generation/__init__.py b/benchmarks/nel/scripts/candidate_generation/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmarks/nel/scripts/candidate_generation/base.py b/benchmarks/nel/scripts/candidate_generation/base.py deleted file mode 100644 index 38f25d035..000000000 --- a/benchmarks/nel/scripts/candidate_generation/base.py +++ /dev/null @@ -1,70 +0,0 @@ -""" Base class generation for candidate selection. 
""" -import abc -import pickle -from typing import Dict, Any, Optional, Iterable, Tuple - -import spacy -from spacy import Language -from spacy.kb import KnowledgeBase, Candidate -from spacy.tokens import Span - -from datasets.dataset import Dataset - - -class NearestNeighborCandidateSelector(abc.ABC): - """Callable object selecting candidates via nearest neighbour search.""" - - _pipeline: Optional[Language] = None - _lookup_struct: Optional[Any] = None - _entities: Dict[str, Any] = {} - - def __call__( - self, kb: KnowledgeBase, span: Span, dataset_id: str, language: str, max_n_candidates: int, **kwargs - ) -> Iterable[Candidate]: - """Identifies entity candidates. - kb (KnowledgeBase): KnowledgeBase containing all possible entity candidates. - span (Span): Span to match potential entity candidates with. - dataset_id (str): ID of dataset for which to select candidates. - language (str): Language. - max_n_candidates (int): Numbers of nearest neighbours to query. - RETURNS (Iterator[Candidate]): Candidates for specified entity. - """ - - if self._pipeline is None: - # Load pipeline and pickled entities. Run name doesn't matter for either of those. - paths = Dataset.assemble_paths(dataset_id, "", language) - self._pipeline = spacy.load(paths["nlp_base"]) - with open(paths["entities"], "rb") as file: - self._entities[dataset_id] = pickle.load(file) - if self._lookup_struct is None: - self._lookup_struct = self._init_lookup_structure(kb, max_n_candidates, **kwargs) - - # Retrieve candidates from KB. - return self._fetch_candidates(dataset_id, span, kb, max_n_candidates, **kwargs) - - @abc.abstractmethod - def _init_lookup_structure(self, kb: KnowledgeBase, max_n_candidates: int, **kwargs) -> Any: - """Init container for lookups for new dataset. Doesn't do anything if initialized for this dataset already. - kb (KnowledgeBase): KnowledgeBase containing all possible entity candidates. - max_n_candidates (int): Max. number of candidates to generate. - RETURNS (Any): Initialized container. - """ - raise NotImplementedError - - @abc.abstractmethod - def _fetch_candidates( - self, - dataset_id: str, - span: Span, - kb: KnowledgeBase, - max_n_candidates: int, - **kwargs - ) -> Iterable[Candidate]: - """Fetches candidates for entity in span.text. - dataset_id (str): ID of dataset for which to select candidates. - span (Span): candidate span. - kb (KnowledgeBase): KnowledgeBase containing all possible entity candidates. - max_n_candidates (int): Max. number of candidates to generate. - RETURNS (Iterator[Candidate]): Candidates for specified entity. - """ - raise NotImplementedError diff --git a/benchmarks/nel/scripts/candidate_generation/embeddings.py b/benchmarks/nel/scripts/candidate_generation/embeddings.py deleted file mode 100644 index bd4f49987..000000000 --- a/benchmarks/nel/scripts/candidate_generation/embeddings.py +++ /dev/null @@ -1,53 +0,0 @@ -""" Candidate generation via distance in embedding space. 
""" -import time -from typing import Iterable, List, Set - -import numpy -from sklearn.neighbors import NearestNeighbors - -from spacy.kb import KnowledgeBase -from spacy.tokens import Span -from .base import NearestNeighborCandidateSelector -from rapidfuzz.string_metric import normalized_levenshtein - - -class EmbeddingCandidateSelector(NearestNeighborCandidateSelector): - """Callable object selecting candidates as nearest neighbours in embedding space.""" - - _entity_ids: List[str] = [] - - def _init_lookup_structure(self, kb: KnowledgeBase, max_n_candidates: int, **kwargs) -> NearestNeighbors: - container = NearestNeighbors(n_neighbors=max_n_candidates, metric="cosine", n_jobs=1) - container.fit(numpy.asarray([kb.get_vector(ent_id) for ent_id in kb.get_entity_strings()])) - self._entity_ids = kb.get_entity_strings() - - return container - - def _fetch_candidates( - self, - dataset_id: str, - span: Span, - kb: KnowledgeBase, - max_n_candidates: int, - lexical_similarity_cutoff: float = 0.5, - ) -> Iterable[int]: - target_vec = span.vector - if not isinstance(target_vec, numpy.ndarray): - target_vec = target_vec.get() - - nn_idx = self._lookup_struct.kneighbors(target_vec.reshape((1, -1)))[1][0] - nn_entities = {self._entity_ids[i]: self._entities[dataset_id][self._entity_ids[i]] for i in nn_idx} - candidate_entity_ids: Set[str] = set() - for nne in nn_entities: - for name in nn_entities[nne].aliases: - if normalized_levenshtein(name.lower(), span.text.lower()) / 100 >= lexical_similarity_cutoff: - candidate_entity_ids.add(nne) - break - - return { - cand - for cands_for_alias in [ - kb.get_alias_candidates("_" + cei + "_") for cei in candidate_entity_ids - ] - for cand in cands_for_alias - } diff --git a/benchmarks/nel/scripts/candidate_generation/lexical.py b/benchmarks/nel/scripts/candidate_generation/lexical.py deleted file mode 100644 index 3c5c40155..000000000 --- a/benchmarks/nel/scripts/candidate_generation/lexical.py +++ /dev/null @@ -1,29 +0,0 @@ -""" Candidate generation via distance in lexical space. 
""" -from typing import Iterable - -from spacy.kb import KnowledgeBase, Candidate -from spacy.tokens import Span -from .base import NearestNeighborCandidateSelector -from cfuzzyset import cFuzzySet as FuzzySet - - -class LexicalCandidateSelector(NearestNeighborCandidateSelector): - """Callable object selecting candidates as nearest neighbours in lexical space.""" - - def _init_lookup_structure(self, kb: KnowledgeBase, max_n_candidates: int, **kwargs) -> FuzzySet: - return FuzzySet(kb.get_alias_strings()) - - def _fetch_candidates( - self, - dataset_id: str, - span: Span, - kb: KnowledgeBase, - max_n_candidates: int, - similarity_cutoff: float = 0.5, - ) -> Iterable[int]: - all_cands = [ - kb.get_alias_candidates(entry[1]) for entry in self._lookup_struct.get(span.text, []) - if entry[0] >= similarity_cutoff - ][:max_n_candidates] - - return {cand for cands_for_alias in all_cands for cand in cands_for_alias} diff --git a/benchmarks/nel/scripts/clean_data.py b/benchmarks/nel/scripts/cli_clean_data.py similarity index 100% rename from benchmarks/nel/scripts/clean_data.py rename to benchmarks/nel/scripts/cli_clean_data.py diff --git a/benchmarks/nel/scripts/compare_evaluations.py b/benchmarks/nel/scripts/cli_compare_evaluations.py similarity index 100% rename from benchmarks/nel/scripts/compare_evaluations.py rename to benchmarks/nel/scripts/cli_compare_evaluations.py diff --git a/benchmarks/nel/scripts/compile_corpora.py b/benchmarks/nel/scripts/cli_compile_corpora.py similarity index 100% rename from benchmarks/nel/scripts/compile_corpora.py rename to benchmarks/nel/scripts/cli_compile_corpora.py diff --git a/benchmarks/nel/scripts/evaluate.py b/benchmarks/nel/scripts/cli_evaluate.py similarity index 89% rename from benchmarks/nel/scripts/evaluate.py rename to benchmarks/nel/scripts/cli_evaluate.py index 3dc1bf242..36e4a09b1 100644 --- a/benchmarks/nel/scripts/evaluate.py +++ b/benchmarks/nel/scripts/cli_evaluate.py @@ -14,4 +14,5 @@ def main(dataset_name: str, run_name: str, language: str): if __name__ == "__main__": - typer.run(main) + main("mewsli_9", "cg-lexical", "en") + # typer.run(main) diff --git a/benchmarks/nel/scripts/parse_corpus.py b/benchmarks/nel/scripts/cli_extract_annotations.py similarity index 78% rename from benchmarks/nel/scripts/parse_corpus.py rename to benchmarks/nel/scripts/cli_extract_annotations.py index 1237a6b59..6731f8546 100644 --- a/benchmarks/nel/scripts/parse_corpus.py +++ b/benchmarks/nel/scripts/cli_extract_annotations.py @@ -8,7 +8,7 @@ def main(dataset_name: str, language: str): dataset_name (str): Name of dataset to evaluate on. language (str): Language. """ - Dataset.generate_from_id(dataset_name, language, "").parse_corpus() + Dataset.generate_from_id(dataset_name, language, "").extract_annotations() if __name__ == "__main__": diff --git a/benchmarks/nel/scripts/custom_functions.py b/benchmarks/nel/scripts/custom_functions.py deleted file mode 100644 index 2ecb0e753..000000000 --- a/benchmarks/nel/scripts/custom_functions.py +++ /dev/null @@ -1,65 +0,0 @@ -""" Custom functions to be hooked up into the registry. 
""" -from functools import partial - -from typing import Iterable, Callable -import typing -import spacy -from spacy.kb import Candidate, KnowledgeBase -from spacy.tokens import Span - -from scripts.candidate_generation import embeddings -from scripts.candidate_generation import lexical - -embedding_candidate_selector = embeddings.EmbeddingCandidateSelector() -fuzzy_lexical_candidate_selector = lexical.LexicalCandidateSelector() - - -@spacy.registry.misc("EmbeddingGetCandidates.v1") -def create_candidates_via_embeddings( - dataset_name: str, language: str, max_n_candidates: int, lexical_similarity_cutoff: float -) -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: - """Returns Callable for identification of candidates via their embeddings. - dataset_name (str): Dataset name. - langugage (str): Language. - max_n_candidates (int): Numbers of nearest neighbours to query. - RETURNS (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Callable for identification of entity candidates. - """ - - # More elegant way to enforce proper typing for partial object? - return typing.cast( - Callable[[KnowledgeBase, Span], Iterable[Candidate]], - partial( - embedding_candidate_selector, - dataset_id=dataset_name, - language=language, - max_n_candidates=max_n_candidates, - lexical_similarity_cutoff=lexical_similarity_cutoff, - ), - ) - - -@spacy.registry.misc("FuzzyStringGetCandidates.v1") -def create_candidates_via_fuzzy_string_matching( - dataset_name: str, language: str, max_n_candidates: int, similarity_cutoff: float -) -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: - """Returns Callable for identification of candidates via NN search in lexical space. - dataset_name (str): Dataset name. - langugage (str): Language. - max_n_candidates (int): Numbers of nearest neighbours to query. - similarity_cutoff (float): Similarity value below which candidates won't be included. - RETURNS (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Callable for identification of entity candidates. - """ - - assert 0 <= similarity_cutoff <= 1 - - # More elegant way to enforce proper typing for partial object? 
- return typing.cast( - Callable[[KnowledgeBase, Span], Iterable[Candidate]], - partial( - fuzzy_lexical_candidate_selector, - dataset_id=dataset_name, - language=language, - max_n_candidates=max_n_candidates, - similarity_cutoff=similarity_cutoff, - ), - ) diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/scripts/datasets/dataset.py index 9a139c5e7..aa932ebc2 100644 --- a/benchmarks/nel/scripts/datasets/dataset.py +++ b/benchmarks/nel/scripts/datasets/dataset.py @@ -47,8 +47,6 @@ def __init__(self, run_name: str, language: str): with open(self._paths["root"] / "configs" / "datasets.yml", "r") as stream: self._options = yaml.safe_load(stream)[self.name] - self._entities: Optional[Dict[str, schemas.Entity]] = None - self._failed_entity_lookups: Optional[Set[str]] = None self._annotations: Optional[Dict[str, List[schemas.Annotation]]] = None self._kb: Optional[KnowledgeBase] = None self._nlp_base: Optional[Language] = None @@ -74,8 +72,6 @@ def assemble_paths(dataset_name: str, run_name: str, language: str) -> Dict[str, "assets": assets_path, "nlp_base": wikid_path / language / "nlp", "kb": wikid_path / language / "kb", - "entities": assets_path / "entities.pkl", - "failed_entity_lookups": assets_path / "entities_failed_lookups.pkl", "annotations": assets_path / "annotations.pkl", "nlp_best": root_path / "training" / dataset_name / run_name / "model-best", "corpora": root_path / "corpora" / dataset_name @@ -91,12 +87,10 @@ def compile_corpora(self, filter_terms: Optional[Set[str]] = None) -> None: filter_terms (Optional[Set[str]]): Set of filter terms. Only documents containing at least one of the specified terms will be included in corpora. If None, all documents are included. """ - self._load_resource("entities") - self._load_resource("failed_entity_lookups") self._load_resource("annotations") self._load_resource("nlp_base") Doc.set_extension("overlapping_annotations", default=None) - self._annotated_docs = self._create_annotated_docs(filter_terms) + self._annotated_docs = self._create_annotated_docs(filter_terms)[:500] self._serialize_corpora() def _create_annotated_docs(self, filter_terms: Optional[Set[str]] = None) -> List[Doc]: @@ -107,31 +101,19 @@ def _create_annotated_docs(self, filter_terms: Optional[Set[str]] = None) -> Lis """ raise NotImplementedError - def parse_corpus(self, **kwargs) -> None: - """Parses corpus. Loads data on entities and mentions. - Populates self._entities, self._failed_entity_lookups, self._annotations. - RETURNS (Tuple[Dict[str, Entity], Set[str], Dict[str, List[Annotation]]]): entities, titles of failed entity - lookups, annotations. + def extract_annotations(self, **kwargs) -> None: + """Parses corpus and extracts annotations. Loads data on entities and mentions. + Populates self._annotations. """ self._load_resource("nlp_base") - logger.info("Parsing external corpus") - ( - self._entities, - self._failed_entity_lookups, - self._annotations, - ) = self._parse_corpus(**kwargs) - - # Serialize entity information. 
- for to_serialize in ( - (self._paths["entities"], self._entities), - (self._paths["failed_entity_lookups"], self._failed_entity_lookups), - (self._paths["annotations"], self._annotations), - ): - with open(to_serialize[0], "wb") as fp: - pickle.dump(to_serialize[1], fp) - logger.info("Successfully parsed corpus.") - - def _parse_corpus( + logger.info("Extracting annotations from corpus") + self._annotations = self._extract_annotations_from_corpus(**kwargs) + with open(self._paths["annotations"], "wb") as fp: + pickle.dump(self._annotations, fp) + + logger.info("Successfully extracted annotations from corpus.") + + def _extract_annotations_from_corpus( self, **kwargs ) -> Tuple[Dict[str, schemas.Entity], Set[str], Dict[str, List[schemas.Annotation]]]: """Parses corpus. Loads data on entities and mentions. @@ -196,14 +178,6 @@ def _load_resource(self, key: str, force: bool = False) -> None: elif key == "annotations" and (force or not self._annotations): with open(path, "rb") as file: self._annotations = pickle.load(file) - elif key == "entities" and (force or not self._entities): - with open(path, "rb") as file: - self._entities = pickle.load(file) - elif key == "failed_entity_lookups" and ( - force or not self._failed_entity_lookups - ): - with open(self._paths["failed_entity_lookups"], "rb") as file: - self._failed_entity_lookups = pickle.load(file) def evaluate(self, run_name: str) -> None: """Evaluates trained pipeline on test set. @@ -236,6 +210,9 @@ def evaluate(self, run_name: str) -> None: for ent in doc.ents ] + for doc in docs: + pred_doc = self._nlp_best(doc) + test_set = [ Example(predicted_doc, doc) for predicted_doc, doc in zip( diff --git a/benchmarks/nel/scripts/datasets/mewsli_9.py b/benchmarks/nel/scripts/datasets/mewsli_9.py index 1623f73c8..55b980ee6 100644 --- a/benchmarks/nel/scripts/datasets/mewsli_9.py +++ b/benchmarks/nel/scripts/datasets/mewsli_9.py @@ -1,6 +1,7 @@ """ Dataset class for Mewsli-9 dataset. """ import csv import distutils.dir_util +import time from typing import Tuple, Set, List, Dict, Optional import tqdm @@ -8,7 +9,7 @@ from datasets.dataset import Dataset from datasets.utils import fetch_entity_information, create_spans_from_doc_annotation -from wikid import schemas +from wikid import schemas, load_entities class Mewsli9Dataset(Dataset): @@ -18,10 +19,9 @@ class Mewsli9Dataset(Dataset): def name(self) -> str: return "mewsli_9" - def _parse_corpus( + def _extract_annotations_from_corpus( self, **kwargs - ) -> Tuple[Dict[str, schemas.Entity], Set[str], Dict[str, List[schemas.Annotation]]]: - entity_qids: Set[str] = set() + ) -> Dict[str, List[schemas.Annotation]]: annotations: Dict[str, List[schemas.Annotation]] = {} with open( @@ -30,7 +30,6 @@ def _parse_corpus( for i, row in enumerate(csv.DictReader(file_path, delimiter="\t")): assert len(row) == 9 - entity_qids.add(row["qid"]) if row["docid"] not in annotations: annotations[row["docid"]] = [] annotations[row["docid"]].append( @@ -42,9 +41,7 @@ def _parse_corpus( ) ) - entities, failed_entity_lookups, _ = fetch_entity_information(tuple(entity_qids), self._language) - - return entities, failed_entity_lookups, annotations + return annotations def clean_assets(self) -> None: # No cleaning necessary, just copy all data into /clean. 
@@ -60,6 +57,10 @@ def _create_annotated_docs(self, filter_terms: Optional[Set[str]] = None) -> Lis title_file.seek(0) n_annots_available = 0 n_annots_assigned = 0 + entities = load_entities( + qids=tuple({annot.entity_id for annots in self._annotations.values() for annot in annots}), + language=self._language + ) with tqdm.tqdm( desc="Creating doc objects", total=row_count, leave=False @@ -83,7 +84,7 @@ def _create_annotated_docs(self, filter_terms: Optional[Set[str]] = None) -> Lis doc_annots = self._annotations.get(row["docid"], []) doc.ents, _ = create_spans_from_doc_annotation( doc=doc, - entities_info=self._entities, + entities_info=entities, annotations=doc_annots, harmonize_with_doc_ents=True, ) diff --git a/benchmarks/nel/test_nel_benchmark.py b/benchmarks/nel/test_nel_benchmark.py index 54b26d217..d6aca8e47 100644 --- a/benchmarks/nel/test_nel_benchmark.py +++ b/benchmarks/nel/test_nel_benchmark.py @@ -11,7 +11,6 @@ def test_nel_benchmark(): overrides_key = "SPACY_CONFIG_OVERRIDES" root = Path(__file__).parent - project_run(root, "download_mewsli9", capture=True) project_run(root, "download_model", capture=True) project_run(root, "wikid_clone", capture=True) @@ -25,7 +24,7 @@ def test_nel_benchmark(): # Re-enable config overrides, if set before. if overrides: os.environ[overrides_key] = overrides - project_run(root, "parse_corpus", capture=True) + project_run(root, "extract_annotations", capture=True) project_run(root, "compile_corpora", capture=True) project_run(root, "train", capture=True, overrides={"vars.training_max_steps": 1, "vars.training_max_epochs": 1}) project_run(root, "evaluate", capture=True) From 1533943269ed2872a630591bb1748a5603a3aa31 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 23 Nov 2022 21:38:52 +0100 Subject: [PATCH 27/38] Changes for integration of WikiKB into project workflow. 
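Besides wiring the entity_linker to the new WikiKB through the registered "spacy.WikiKBFromFile.v1" loader, this commit batches the wikid entity lookups so that a single SQLite query never exceeds the maximum number of bound parameters. The same pattern, pulled out as a small helper for clarity (a sketch mirroring the dict comprehension in mewsli_9.py, not code from the patch itself):

    from wikid import load_entities

    def load_entities_batched(qids, language, batch_size=2**14):
        """Resolve a tuple of QIDs in chunks to stay under SQLite's bound-parameter limit."""
        entities = {}
        for i in range(0, len(qids), batch_size):
            entities.update(load_entities(qids=qids[i:i + batch_size], language=language))
        return entities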
--- benchmarks/nel/configs/nel.cfg | 7 ++-- benchmarks/nel/project.yml | 22 +++++++------ benchmarks/nel/requirements.txt | 4 +-- benchmarks/nel/scripts/cli_compile_corpora.py | 15 +++++++-- benchmarks/nel/scripts/custom_functions.py | 18 +++++++++++ benchmarks/nel/scripts/datasets/dataset.py | 28 ++++++++++------ benchmarks/nel/scripts/datasets/mewsli_9.py | 32 +++++++++++++------ benchmarks/nel/scripts/datasets/utils.py | 16 ---------- benchmarks/nel/scripts/train.sh | 5 +-- 9 files changed, 92 insertions(+), 55 deletions(-) create mode 100644 benchmarks/nel/scripts/custom_functions.py diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index ffc612369..c10ee90bd 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -6,6 +6,7 @@ dev = "" raw = null init_tok2vec = null kb = "" +db = "" base_nlp = "" vectors = "${paths.base_nlp}" @@ -41,9 +42,11 @@ incl_context = true incl_prior = true labels_discard = [] get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} +get_candidates_all = {"@misc":"spacy.CandidateAllGenerator.v1"} +candidates_doc_mode = True [components.entity_linker.model] -@architectures = "spacy.EntityLinker.v1" +@architectures = "spacy.EntityLinker.v2" nO = null [components.entity_linker.model.tok2vec] @@ -68,7 +71,7 @@ lookups = null [initialize.components.entity_linker] [initialize.components.entity_linker.kb_loader] -@misc = "spacy.KBFromFile.v1" +@misc = "spacy.WikiKBFromFile.v1" kb_path = ${paths.kb} [initialize.tokenizer] diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 0dcd87b11..a70125c7e 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -4,7 +4,7 @@ vars: run: "cg-default" language: "en" config: "nel.cfg" - vectors_model: "en_core_web_lg" + base_model: "en_core_web_lg" version: "0.0.6" dataset: "mewsli_9" gpu_id: "" @@ -46,12 +46,12 @@ commands: - name: download_model help: "Download a model with pretrained vectors and NER component." script: - - "python -m spacy download ${vars.vectors_model}" + - "python -m spacy download ${vars.base_model}" - name: wikid_clone help: "Clone `wikid` to prepare Wiki database and `KnowledgeBase`." script: - - "git clone https://github.com/rmitsch/wikid.git --branch feature/kb + - "git clone https://github.com/rmitsch/wikid.git --branch fix/reestablish-db-connection-after-load" - "pip install -r wikid/requirements.txt" - name: preprocess @@ -80,12 +80,12 @@ commands: - name: wikid_create_kb help: "Create the knowledge base and write it to file." script: - - "spacy project run create_kb wikid --vars.language ${vars.language} --vars.vectors_model ${vars.vectors_model} --force" + - "spacy project run create_kb wikid --vars.language ${vars.language} --vars.vectors_model ${vars.base_model} --force" deps: - "wikid/output/${vars.language}/wiki.sqlite3" - outputs_no_cache: + outputs: - "wikid/output/${vars.language}/kb" - - "wikid/output/${vars.language}/nlp" + - "wikid/output/${vars.language}/wiki.annoy" - name: extract_annotations help: "Extract annotations from corpus." @@ -100,11 +100,10 @@ commands: - name: compile_corpora help: "Compile corpora, separated in train/dev/test sets." script: - - "env PYTHONPATH=. python ./scripts/cli_compile_corpora.py ${vars.dataset} ${vars.language} ${vars.filter}" + - "env PYTHONPATH=. 
python ./scripts/cli_compile_corpora.py ${vars.dataset} ${vars.language} ${vars.base_model} ${vars.filter}" deps: - "assets/${vars.dataset}/annotations.pkl" - "wikid/output/${vars.language}/kb" - - "wikid/output/${vars.language}/nlp" - "configs/datasets.yml" outputs: - "corpora/${vars.dataset}/train.spacy" @@ -119,7 +118,8 @@ commands: - "training/${vars.dataset}/${vars.run}" deps: - "wikid/output/${vars.language}/kb" - - "wikid/output/${vars.language}/nlp" + - "wikid/output/${vars.language}/wiki.annoy" + - "training/base-nlp/${vars.language}" - "corpora/${vars.dataset}/train.spacy" - "corpora/${vars.dataset}/dev.spacy" @@ -129,7 +129,9 @@ commands: - "env PYTHONPATH=. python ./scripts/cli_evaluate.py ${vars.dataset} '${vars.run}' ${vars.language}" deps: - "training/${vars.dataset}/${vars.run}/model-best" - - "wikid/output/${vars.language}/nlp" + - "training/base-nlp/${vars.language}" + - "wikid/output/${vars.language}/wiki.annoy" + - "training/base-nlp/${vars.language}" - "corpora/${vars.dataset}/dev.spacy" outputs: - "evaluation/${vars.dataset}" diff --git a/benchmarks/nel/requirements.txt b/benchmarks/nel/requirements.txt index 2fe786dcc..037679068 100644 --- a/benchmarks/nel/requirements.txt +++ b/benchmarks/nel/requirements.txt @@ -1,7 +1,5 @@ +spacy @ git+https://github.com/rmitsch/spaCy.git@feature/candidate-generation-by-docs pyyaml tqdm prettytable -scikit-learn -fuzzyset2 -spacyfishing virtualenv \ No newline at end of file diff --git a/benchmarks/nel/scripts/cli_compile_corpora.py b/benchmarks/nel/scripts/cli_compile_corpora.py index dce834ce8..4b682ee8b 100644 --- a/benchmarks/nel/scripts/cli_compile_corpora.py +++ b/benchmarks/nel/scripts/cli_compile_corpora.py @@ -1,20 +1,29 @@ """ Compiles train/dev/test corpora. """ +from pathlib import Path +from typing import Set, Optional import typer from datasets.dataset import Dataset -from wikid import read_filter_terms -def main(dataset_name: str, language: str, use_filter_terms: bool = typer.Argument(False)): +def main(dataset_name: str, language: str, model: str, use_filter_terms: bool = typer.Argument(False)): """Create corpora in spaCy format. dataset_name (str): Dataset name. language (str): Language. + model (str): Name or path of model with tokenizer, tok2vec and parser. use_filter_terms (bool): Whether to use the filter terms defined in the dataset config. If True, only documents containing at least one of the specified terms will be included in corpora. If False, all documents are included. """ + filter_terms: Optional[Set[str]] = None + if use_filter_terms: + with open( + Path(__file__).parent.parent / "wikid" / "configs" / "filter_terms.txt", "r" + ) as file: + filter_terms = {ft.replace("\n", "") for ft in file.readlines()} + # Run name isn't relevant for corpora compilation. - Dataset.generate_from_id(dataset_name, language).compile_corpora(read_filter_terms() if use_filter_terms else None) + Dataset.generate_from_id(dataset_name, language).compile_corpora(model, filter_terms) if __name__ == "__main__": diff --git a/benchmarks/nel/scripts/custom_functions.py b/benchmarks/nel/scripts/custom_functions.py new file mode 100644 index 000000000..2dd1dbf01 --- /dev/null +++ b/benchmarks/nel/scripts/custom_functions.py @@ -0,0 +1,18 @@ +from pathlib import Path +from typing import Callable + +from spacy import registry, Vocab + +from wikid.scripts.kb import WikiKB + + +@registry.misc("spacy.WikiKBFromFile.v1") +def load_kb(kb_path: Path) -> Callable[[Vocab], WikiKB]: + """Loads WikiKB instance from disk. 
+ kb_path (Path): Path to WikiKB file. + RETURNS (Callable[[Vocab], WikiKB]): Callable generating WikiKB from disk. + """ + def kb_from_file(_: Vocab): + return WikiKB.generate_from_disk(path=kb_path) + + return kb_from_file diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/scripts/datasets/dataset.py index aa932ebc2..ed1f7e14a 100644 --- a/benchmarks/nel/scripts/datasets/dataset.py +++ b/benchmarks/nel/scripts/datasets/dataset.py @@ -49,7 +49,6 @@ def __init__(self, run_name: str, language: str): self._annotations: Optional[Dict[str, List[schemas.Annotation]]] = None self._kb: Optional[KnowledgeBase] = None - self._nlp_base: Optional[Language] = None self._nlp_best: Optional[Language] = None self._annotated_docs: Optional[List[Doc]] = None @@ -63,16 +62,16 @@ def assemble_paths(dataset_name: str, run_name: str, language: str) -> Dict[str, """ root_path = Path(os.path.abspath(__file__)).parent.parent.parent - wikid_path = root_path / "wikid" / "output" + wikid_output_path = root_path / "wikid" / "output" assets_path = root_path / "assets" / dataset_name return { "root": root_path, "evaluation": root_path / "configs" / "evaluation.yml", "assets": assets_path, - "nlp_base": wikid_path / language / "nlp", - "kb": wikid_path / language / "kb", + "kb": wikid_output_path / language / "kb", "annotations": assets_path / "annotations.pkl", + "nlp_base": root_path / "training" / "base-nlp" / language, "nlp_best": root_path / "training" / dataset_name / run_name / "model-best", "corpora": root_path / "corpora" / dataset_name } @@ -82,19 +81,31 @@ def name(self) -> str: """Returns dataset name.""" raise NotImplementedError - def compile_corpora(self, filter_terms: Optional[Set[str]] = None) -> None: + def compile_corpora(self, model: str, filter_terms: Optional[Set[str]] = None) -> None: """Creates train/dev/test corpora for dataset. + model (str): Name or path of model with tokenizer, tok2vec, parser, tagger, parser. filter_terms (Optional[Set[str]]): Set of filter terms. Only documents containing at least one of the specified terms will be included in corpora. If None, all documents are included. """ self._load_resource("annotations") - self._load_resource("nlp_base") Doc.set_extension("overlapping_annotations", default=None) - self._annotated_docs = self._create_annotated_docs(filter_terms)[:500] + nlp_components = ["tok2vec", "parser", "tagger", "senter", "attribute_ruler"] + nlp = spacy.load(model, enable=nlp_components, disable=[]) + + # Incorporate annotations from corpus into documents. + self._annotated_docs = self._create_annotated_docs(nlp, filter_terms) + + # Serialize pipeline and corpora. + self._paths["nlp_base"].parent.mkdir(parents=True, exist_ok=True) + nlp.to_disk( + self._paths["nlp_base"], + exclude=[comp for comp in nlp.component_names if comp not in nlp_components] + ) self._serialize_corpora() - def _create_annotated_docs(self, filter_terms: Optional[Set[str]] = None) -> List[Doc]: + def _create_annotated_docs(self, nlp: Language, filter_terms: Optional[Set[str]] = None) -> List[Doc]: """Creates docs annotated with entities. + nlp (Language): Model with tokenizer, tok2vec and parser. filter_terms (Optional[Set[str]]): Set of filter terms. Only documents containing at least one of the specified terms will be included in corpora. If None, all documents are included. RETURN (List[Doc]): List of docs reflecting all entity annotations. @@ -105,7 +116,6 @@ def extract_annotations(self, **kwargs) -> None: """Parses corpus and extracts annotations. 
Loads data on entities and mentions. Populates self._annotations. """ - self._load_resource("nlp_base") logger.info("Extracting annotations from corpus") self._annotations = self._extract_annotations_from_corpus(**kwargs) with open(self._paths["annotations"], "wb") as fp: diff --git a/benchmarks/nel/scripts/datasets/mewsli_9.py b/benchmarks/nel/scripts/datasets/mewsli_9.py index 55b980ee6..faba5b9b1 100644 --- a/benchmarks/nel/scripts/datasets/mewsli_9.py +++ b/benchmarks/nel/scripts/datasets/mewsli_9.py @@ -4,7 +4,9 @@ import time from typing import Tuple, Set, List, Dict, Optional +import spacy import tqdm +from spacy import Language from spacy.tokens import Doc from datasets.dataset import Dataset @@ -47,20 +49,33 @@ def clean_assets(self) -> None: # No cleaning necessary, just copy all data into /clean. distutils.dir_util.copy_tree(str(self._paths["assets"] / "raw"), str(self._paths["assets"] / "clean")) - def _create_annotated_docs(self, filter_terms: Optional[Set[str]] = None) -> List[Doc]: + def _create_annotated_docs(self, nlp: Language, filter_terms: Optional[Set[str]] = None) -> List[Doc]: annotated_docs: List[Doc] = [] with open( self._paths["assets"] / "clean" / "en" / "docs.tsv", encoding="utf-8" ) as title_file: + # todo + # - update nel.cfg with correct file path + # - add KB loader - code and to config + # - ensure training runs and uses WikiKB row_count = sum(1 for _ in title_file) title_file.seek(0) n_annots_available = 0 n_annots_assigned = 0 - entities = load_entities( - qids=tuple({annot.entity_id for annots in self._annotations.values() for annot in annots}), - language=self._language - ) + + # Load entities batched to avoid hitting max. number of parameters supported by SQLite. + batch_size = 2**14 + qids = tuple({annot.entity_id for annots in self._annotations.values() for annot in annots}) + entities = { + qid: entity_info + for entity_batch in + [ + load_entities(qids=qids[i:i + batch_size], language=self._language) + for i in range(0, len(qids), batch_size) + ] + for qid, entity_info in entity_batch.items() + } with tqdm.tqdm( desc="Creating doc objects", total=row_count, leave=False @@ -80,13 +95,10 @@ def _create_annotated_docs(self, filter_terms: Optional[Set[str]] = None) -> Lis pbar.update(1) continue - doc = self._nlp_base(doc_text) + doc = nlp(doc_text) doc_annots = self._annotations.get(row["docid"], []) doc.ents, _ = create_spans_from_doc_annotation( - doc=doc, - entities_info=entities, - annotations=doc_annots, - harmonize_with_doc_ents=True, + doc=doc, entities_info=entities, annotations=doc_annots, ) annotated_docs.append(doc) n_annots_available += len(doc_annots) diff --git a/benchmarks/nel/scripts/datasets/utils.py b/benchmarks/nel/scripts/datasets/utils.py index 5d7d4688a..fa5281fbb 100644 --- a/benchmarks/nel/scripts/datasets/utils.py +++ b/benchmarks/nel/scripts/datasets/utils.py @@ -67,29 +67,17 @@ def create_spans_from_doc_annotation( doc: Doc, entities_info: Dict[str, schemas.Entity], annotations: List[schemas.Annotation], - harmonize_with_doc_ents: bool, ) -> Tuple[List[Span], List[schemas.Annotation]]: """Creates spans from annotations for one document. doc (Doc): Document for whom to create spans. entities_info (Dict[str, Entity]): All available entities. annotation (List[Dict[str, Union[Set[str], str, int]]]): Annotations for this post/comment. - harmonize_harmonize_with_doc_ents (Language): Whether to only keep those annotations matched by entities in the - provided Doc object. 
RETURNS (Tuple[List[Span], List[Dict[str, Union[Set[str], str, int]]]]): List of doc spans for annotated entities; list of overlapping entities. """ - doc_ents_idx = { - # spaCy sometimes includes leading articles in entities, our benchmark datasets don't. Hence we drop all leading - # "the " and adjust the entity positions accordingly. - (ent.start_char + (0 if not ent.text.lower().startswith("the ") else 4), ent.end_char) - for ent in doc.ents - } doc_annots: List[schemas.Annotation] = [] overlapping_doc_annotations: List[schemas.Annotation] = [] - if harmonize_with_doc_ents and len(doc_ents_idx) == 0: - return [], [] - for i, annot_data in enumerate( sorted( [ @@ -120,10 +108,6 @@ def create_spans_from_doc_annotation( annot.end_pos = token.idx + len(token) break - # After token alignment: filter with NER pipeline, if available. - if harmonize_with_doc_ents and (annot.start_pos, annot.end_pos) not in doc_ents_idx: - continue - # If there is an overlap between annotation's start and end position and this token's parsed start # and end, we try to create a span with this token's position. overlaps = False diff --git a/benchmarks/nel/scripts/train.sh b/benchmarks/nel/scripts/train.sh index 6a615ecab..9b6a31744 100644 --- a/benchmarks/nel/scripts/train.sh +++ b/benchmarks/nel/scripts/train.sh @@ -9,13 +9,14 @@ gpu_id="${6:--1}" # (4) config file name, # (5) max. steps. # (6) GPU information if GPU is to be used. -PYTHONPATH=scripts python -m spacy train configs/$4 \ +PYTHONPATH='scripts' python -m spacy train configs/$4 \ --paths.dataset_name $1 \ --output training/$1/$2 \ --paths.train corpora/$1/train.spacy \ --paths.dev corpora/$1/dev.spacy \ --paths.kb wikid/output/$3/kb \ - --paths.base_nlp wikid/output/$3/nlp \ + --paths.db wikid/output/$3/wiki.sqlite3 \ + --paths.base_nlp training/base-nlp/$3 \ --paths.language $3 \ --training.max_steps $5 \ -c scripts/custom_functions.py \ From c0d7ebfff67fde2536f719ae119fe979fae42bc6 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 23 Nov 2022 22:18:12 +0100 Subject: [PATCH 28/38] Trigger new tests. --- benchmarks/nel/scripts/custom_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/nel/scripts/custom_functions.py b/benchmarks/nel/scripts/custom_functions.py index 2dd1dbf01..f280f710b 100644 --- a/benchmarks/nel/scripts/custom_functions.py +++ b/benchmarks/nel/scripts/custom_functions.py @@ -12,7 +12,7 @@ def load_kb(kb_path: Path) -> Callable[[Vocab], WikiKB]: kb_path (Path): Path to WikiKB file. RETURNS (Callable[[Vocab], WikiKB]): Callable generating WikiKB from disk. """ - def kb_from_file(_: Vocab): + def kb_from_file(_: Vocab) -> WikiKB: return WikiKB.generate_from_disk(path=kb_path) return kb_from_file From 9757d3544d185d4b844581debca6e357dad9b936 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 24 Nov 2022 17:13:05 +0100 Subject: [PATCH 29/38] Further adjustments for new-style KBs and removal of NER. 
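The custom corpus reader introduced below re-runs the frozen components on each stored text and still carries a todo about copying the gold mention boundaries onto the predicted doc with a NIL entity id. One way that step could look, sketched here for illustration only and not what this patch implements:

    from spacy.tokens import Doc

    def with_gold_mentions(predicted: Doc, reference: Doc) -> Doc:
        """Copy gold mention boundaries onto the predicted doc, leaving the KB id unset (NIL)."""
        spans = []
        for ent in reference.ents:
            span = predicted.char_span(ent.start_char, ent.end_char, label=ent.label_)
            if span is not None:    # skip mentions that do not align with token boundaries
                spans.append(span)
        predicted.ents = spans
        return predicted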
--- benchmarks/nel/configs/nel.cfg | 12 +++--- benchmarks/nel/requirements.txt | 3 +- benchmarks/nel/scripts/cli_compile_corpora.py | 3 +- benchmarks/nel/scripts/cli_evaluate.py | 6 ++- benchmarks/nel/scripts/custom_functions.py | 41 +++++++++++++++++-- benchmarks/nel/scripts/datasets/dataset.py | 29 ++++++------- 6 files changed, 66 insertions(+), 28 deletions(-) diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index c10ee90bd..691c8e35c 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -18,7 +18,7 @@ gpu_allocator = null [nlp] lang = "en" -pipeline = ["senter","ner","entity_linker"] +pipeline = ["senter","entity_linker"] disabled = [] before_creation = null after_creation = null @@ -31,10 +31,6 @@ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} [components.senter] source = "${paths.base_nlp}" -[components.ner] -source = "${paths.base_nlp}" -component = "ner" - [components.entity_linker] factory = "entity_linker" entity_vector_length = 64 @@ -80,8 +76,10 @@ kb_path = ${paths.kb} [corpora] [corpora.train] -@readers = "spacy.Corpus.v1" +@readers = "EntityEnrichedCorpusReader.v1" path = ${paths.train} +dataset_name = ${paths.dataset_name} + [corpora.dev] @readers = "spacy.Corpus.v1" @@ -100,7 +98,7 @@ accumulate_gradient = 2 max_epochs = 0 max_steps = 500 annotating_components = ["senter"] -frozen_components = ["senter","ner"] +frozen_components = ["senter"] before_to_disk = null [training.logger] diff --git a/benchmarks/nel/requirements.txt b/benchmarks/nel/requirements.txt index 037679068..dbc056b70 100644 --- a/benchmarks/nel/requirements.txt +++ b/benchmarks/nel/requirements.txt @@ -2,4 +2,5 @@ spacy @ git+https://github.com/rmitsch/spaCy.git@feature/candidate-generation-by pyyaml tqdm prettytable -virtualenv \ No newline at end of file +virtualenv +spacyfishing \ No newline at end of file diff --git a/benchmarks/nel/scripts/cli_compile_corpora.py b/benchmarks/nel/scripts/cli_compile_corpora.py index 4b682ee8b..33a9de64d 100644 --- a/benchmarks/nel/scripts/cli_compile_corpora.py +++ b/benchmarks/nel/scripts/cli_compile_corpora.py @@ -27,4 +27,5 @@ def main(dataset_name: str, language: str, model: str, use_filter_terms: bool = if __name__ == "__main__": - typer.run(main) + main("mewsli_9", "en", "en_core_web_lg", True) + # typer.run(main) diff --git a/benchmarks/nel/scripts/cli_evaluate.py b/benchmarks/nel/scripts/cli_evaluate.py index 36e4a09b1..8a2d9ebc3 100644 --- a/benchmarks/nel/scripts/cli_evaluate.py +++ b/benchmarks/nel/scripts/cli_evaluate.py @@ -1,7 +1,6 @@ """ Evaluation on test data. """ from datasets.dataset import Dataset import typer -from custom_functions import create_candidates_via_embeddings def main(dataset_name: str, run_name: str, language: str): @@ -10,9 +9,12 @@ def main(dataset_name: str, run_name: str, language: str): run_name (str): Run name. language (str): Language. """ + # todo + # - add custom loader making sure that, for training, entities are in documents loaded from docbin + # - figure out spacy.load() issue (issue with to_disk() in combination with wikikb?) 
Dataset.generate_from_id(dataset_name, language, run_name).evaluate(run_name=run_name) if __name__ == "__main__": - main("mewsli_9", "cg-lexical", "en") + main("mewsli_9", "cg-default", "en") # typer.run(main) diff --git a/benchmarks/nel/scripts/custom_functions.py b/benchmarks/nel/scripts/custom_functions.py index f280f710b..8afa79ace 100644 --- a/benchmarks/nel/scripts/custom_functions.py +++ b/benchmarks/nel/scripts/custom_functions.py @@ -1,15 +1,50 @@ +import functools from pathlib import Path -from typing import Callable +from typing import Callable, Iterable -from spacy import registry, Vocab +import spacy +from spacy import registry, Vocab, Language +from spacy.tokens import DocBin +from spacy.training import Example from wikid.scripts.kb import WikiKB +@spacy.registry.readers("EntityEnrichedCorpusReader.v1") +def create_docbin_reader(path: Path, dataset_name: str) -> Callable[[Language], Iterable[Example]]: + """Returns Callable generating a corpus reader function that enriches read documents with the correct entities as + specified in the corpus annotations. + path (Path): Path to DocBin file with documents to prepare. + dataset_name (str): Dataset name/ID. + """ + # todo read_files as local function? + return functools.partial(read_files, path) + + +def read_files(path: Path, nlp: Language) -> Iterable[Example]: + # todo docstring + # we run the full pipeline and not just nlp.make_doc to ensure we have entities and sentences + # which are needed during training of the entity linker. + with nlp.select_pipes(disable="entity_linker"): + doc_bin = DocBin().from_disk(path) + docs = list(doc_bin.get_docs(nlp.vocab)) + print(len(docs)) + for doc in docs: + print("***", doc.ents, len(doc.ents)) + doc = nlp(doc.text) + # todo set entities in predicted doc (with entity_id == NIL). + if len(doc.ents): + print(doc) + for ent in doc.ents: + print(" ", ent.ent_id_, ent.start_char, ent.end_char) + print("------") + yield Example(nlp(doc.text), doc) + + @registry.misc("spacy.WikiKBFromFile.v1") def load_kb(kb_path: Path) -> Callable[[Vocab], WikiKB]: """Loads WikiKB instance from disk. - kb_path (Path): Path to WikiKB file. + kb_path (Path): Path to WikiKB path. RETURNS (Callable[[Vocab], WikiKB]): Callable generating WikiKB from disk. """ def kb_from_file(_: Vocab) -> WikiKB: diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/scripts/datasets/dataset.py index ed1f7e14a..1836d817d 100644 --- a/benchmarks/nel/scripts/datasets/dataset.py +++ b/benchmarks/nel/scripts/datasets/dataset.py @@ -89,11 +89,12 @@ def compile_corpora(self, model: str, filter_terms: Optional[Set[str]] = None) - """ self._load_resource("annotations") Doc.set_extension("overlapping_annotations", default=None) - nlp_components = ["tok2vec", "parser", "tagger", "senter", "attribute_ruler"] - nlp = spacy.load(model, enable=nlp_components, disable=[]) + nlp_components = ["tok2vec", "parser", "tagger", "attribute_ruler"] + nlp = spacy.load(model, enable=nlp_components) - # Incorporate annotations from corpus into documents. - self._annotated_docs = self._create_annotated_docs(nlp, filter_terms) + # Incorporate annotations from corpus into documents. Only keep docs with entities (relevant mostly when working + # with filtered data). + self._annotated_docs = [doc for doc in self._create_annotated_docs(nlp, filter_terms) if len(doc.ents)] # Serialize pipeline and corpora. 
self._paths["nlp_base"].parent.mkdir(parents=True, exist_ok=True) @@ -104,7 +105,7 @@ def compile_corpora(self, model: str, filter_terms: Optional[Set[str]] = None) - self._serialize_corpora() def _create_annotated_docs(self, nlp: Language, filter_terms: Optional[Set[str]] = None) -> List[Doc]: - """Creates docs annotated with entities. + """Creates docs annotated with entities. This should set documents `ents` attribute. nlp (Language): Model with tokenizer, tok2vec and parser. filter_terms (Optional[Set[str]]): Set of filter terms. Only documents containing at least one of the specified terms will be included in corpora. If None, all documents are included. @@ -161,8 +162,7 @@ def _serialize_corpora(self) -> None: for key, idx in indices.items(): corpus = DocBin(store_user_data=True, docs=[self._annotated_docs[i] for i in idx]) - if not self._paths["corpora"].exists(): - self._paths["corpora"].mkdir() + self._paths["corpora"].mkdir(parents=True, exist_ok=True) corpus.to_disk(self._paths["corpora"] / f"{key}.spacy") logger.info(f"Completed serializing corpora at {self._paths['corpora']}.") @@ -174,14 +174,14 @@ def _load_resource(self, key: str, force: bool = False) -> None: path = self._paths[key] - if key == "nlp_base" and (force or not self._nlp_base): - self._nlp_base = spacy.load(path) - elif key == "nlp_best" and (force or not self._nlp_best): + if key == "nlp_best" and (force or not self._nlp_best): self._nlp_best = spacy.load(path) elif key == "kb" and (force or not self._kb): - self._load_resource("nlp_base") + nlp = spacy.load(self._paths["nlp_base"]) + # todo how to load knowledgebase if not all arguments are known? + # mandate factory method? self._kb = KnowledgeBase( - vocab=self._nlp_base.vocab, + vocab=nlp.vocab, entity_vector_length=self._nlp_base.vocab.vectors_length, ) self._kb.from_disk(path) @@ -193,9 +193,10 @@ def evaluate(self, run_name: str) -> None: """Evaluates trained pipeline on test set. run_name (str): Run name. """ - self._load_resource("nlp_best") - self._load_resource("nlp_base") + # todo load KB with entity_linker.kb_loader (or retrieve directly from nlp?) + nlp = spacy.load(self._paths["nlp_best"]) self._load_resource("kb") + self._load_resource("nlp_best") with open(self._paths["evaluation"], "r") as config_file: eval_config = yaml.safe_load(config_file) From bddbbdba5e4a6a0c8cc584f57cabdd30a570091d Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 29 Nov 2022 17:05:36 +0100 Subject: [PATCH 30/38] Fix entity handling. Simplify workflow in Dataset class. 
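
With the _load_resource() indirection removed, evaluation loads the trained
pipeline and the WikiKB directly and queries candidates through the
entity_linker's configured candidate generator. In essence (sketch only; the
paths are the Mewsli-9 defaults and purely illustrative):

    from pathlib import Path

    import spacy
    from wikid.scripts.kb import WikiKB

    nlp = spacy.load("training/mewsli_9/cg-default/model-best")
    kb = WikiKB.generate_from_disk(Path("wikid/output/en/kb"))
    entity_linker = nlp.get_pipe("entity_linker")

    doc = nlp("Some text mentioning an entity.")
    for ent in doc.ents:
        # Candidates keyed by QID, mirroring what the evaluation code collects.
        candidates = {cand.entity_: cand for cand in entity_linker.get_candidates(kb, ent)}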
--- benchmarks/nel/configs/nel.cfg | 12 ++-- benchmarks/nel/scripts/cli_compile_corpora.py | 3 +- benchmarks/nel/scripts/cli_evaluate.py | 4 +- benchmarks/nel/scripts/custom_functions.py | 71 ++++++++++++------- benchmarks/nel/scripts/datasets/dataset.py | 57 ++++----------- 5 files changed, 68 insertions(+), 79 deletions(-) diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index 691c8e35c..de0713ef9 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -18,7 +18,7 @@ gpu_allocator = null [nlp] lang = "en" -pipeline = ["senter","entity_linker"] +pipeline = ["senter", "parser", "entity_linker"] disabled = [] before_creation = null after_creation = null @@ -31,6 +31,9 @@ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} [components.senter] source = "${paths.base_nlp}" +[components.parser] +source = "${paths.base_nlp}" + [components.entity_linker] factory = "entity_linker" entity_vector_length = 64 @@ -40,6 +43,7 @@ labels_discard = [] get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} get_candidates_all = {"@misc":"spacy.CandidateAllGenerator.v1"} candidates_doc_mode = True +generate_empty_kb = {"@misc":"spacy.EmptyWikiKB.v1"} [components.entity_linker.model] @architectures = "spacy.EntityLinker.v2" @@ -78,8 +82,6 @@ kb_path = ${paths.kb} [corpora.train] @readers = "EntityEnrichedCorpusReader.v1" path = ${paths.train} -dataset_name = ${paths.dataset_name} - [corpora.dev] @readers = "spacy.Corpus.v1" @@ -97,8 +99,8 @@ eval_frequency = 200 accumulate_gradient = 2 max_epochs = 0 max_steps = 500 -annotating_components = ["senter"] -frozen_components = ["senter"] +annotating_components = ["senter","parser"] +frozen_components = ["senter","parser"] before_to_disk = null [training.logger] diff --git a/benchmarks/nel/scripts/cli_compile_corpora.py b/benchmarks/nel/scripts/cli_compile_corpora.py index 33a9de64d..4b682ee8b 100644 --- a/benchmarks/nel/scripts/cli_compile_corpora.py +++ b/benchmarks/nel/scripts/cli_compile_corpora.py @@ -27,5 +27,4 @@ def main(dataset_name: str, language: str, model: str, use_filter_terms: bool = if __name__ == "__main__": - main("mewsli_9", "en", "en_core_web_lg", True) - # typer.run(main) + typer.run(main) diff --git a/benchmarks/nel/scripts/cli_evaluate.py b/benchmarks/nel/scripts/cli_evaluate.py index 8a2d9ebc3..6f52cdd02 100644 --- a/benchmarks/nel/scripts/cli_evaluate.py +++ b/benchmarks/nel/scripts/cli_evaluate.py @@ -1,5 +1,6 @@ """ Evaluation on test data. """ from datasets.dataset import Dataset +import custom_functions import typer @@ -9,9 +10,6 @@ def main(dataset_name: str, run_name: str, language: str): run_name (str): Run name. language (str): Language. """ - # todo - # - add custom loader making sure that, for training, entities are in documents loaded from docbin - # - figure out spacy.load() issue (issue with to_disk() in combination with wikikb?) 
Dataset.generate_from_id(dataset_name, language, run_name).evaluate(run_name=run_name) diff --git a/benchmarks/nel/scripts/custom_functions.py b/benchmarks/nel/scripts/custom_functions.py index 8afa79ace..94581061c 100644 --- a/benchmarks/nel/scripts/custom_functions.py +++ b/benchmarks/nel/scripts/custom_functions.py @@ -1,4 +1,3 @@ -import functools from pathlib import Path from typing import Callable, Iterable @@ -6,39 +5,36 @@ from spacy import registry, Vocab, Language from spacy.tokens import DocBin from spacy.training import Example +from spacy.pipeline import EntityLinker from wikid.scripts.kb import WikiKB @spacy.registry.readers("EntityEnrichedCorpusReader.v1") -def create_docbin_reader(path: Path, dataset_name: str) -> Callable[[Language], Iterable[Example]]: +def create_docbin_reader(path: Path) -> Callable[[Language], Iterable[Example]]: """Returns Callable generating a corpus reader function that enriches read documents with the correct entities as specified in the corpus annotations. path (Path): Path to DocBin file with documents to prepare. - dataset_name (str): Dataset name/ID. """ - # todo read_files as local function? - return functools.partial(read_files, path) - - -def read_files(path: Path, nlp: Language) -> Iterable[Example]: - # todo docstring - # we run the full pipeline and not just nlp.make_doc to ensure we have entities and sentences - # which are needed during training of the entity linker. - with nlp.select_pipes(disable="entity_linker"): - doc_bin = DocBin().from_disk(path) - docs = list(doc_bin.get_docs(nlp.vocab)) - print(len(docs)) - for doc in docs: - print("***", doc.ents, len(doc.ents)) - doc = nlp(doc.text) - # todo set entities in predicted doc (with entity_id == NIL). - if len(doc.ents): - print(doc) - for ent in doc.ents: - print(" ", ent.ent_id_, ent.start_char, ent.end_char) - print("------") - yield Example(nlp(doc.text), doc) + def read_docbin(nlp: Language) -> Iterable[Example]: + """Read DocBin for training. Set all entities as they appear in the annotated corpus, but set entity type to + NIL. + nlp (Language): Pipeline to use for creating document used in EL from reference document. + """ + nlp.disable_pipe("entity_linker") + + with nlp.select_pipes(disable="entity_linker"): + for doc in DocBin().from_disk(path).get_docs(nlp.vocab): + pred_doc = nlp(doc.text) + pred_doc.ents = [ + doc.char_span(ent.start_char, ent.end_char, label=EntityLinker.NIL, kb_id=EntityLinker.NIL) + for ent in doc.ents + ] + yield Example(pred_doc, doc) + + nlp.enable_pipe("entity_linker") + + return read_docbin @registry.misc("spacy.WikiKBFromFile.v1") @@ -51,3 +47,28 @@ def kb_from_file(_: Vocab) -> WikiKB: return WikiKB.generate_from_disk(path=kb_path) return kb_from_file + + +@registry.misc("spacy.EmptyWikiKB.v1") +def empty_wiki_kb() -> Callable[[Vocab, int], WikiKB]: + """Generates empty WikiKB instance. + RETURNS (Callable[[Vocab, int], WikiKB]): Callable generating WikiKB from disk. + """ + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + """Generates new WikiKB instance. + Since WikiKB relies on an external DB file that we have no information on at this point, this instance will not + have initialized its DB connection. Also, its parameters specified at init are arbitrarily chosen. This only + serves to return a placeholder WikiKB instance to be overwritten using .from_bytes() or .from_disk(). + vocab (Vocab): Vocab instance. + entity_vector_length (int): Entity vector length. 
+ """ + return WikiKB( + vocab=vocab, + entity_vector_length=entity_vector_length, + db_path=Path("."), + annoy_path=Path(".annoy"), + language=".", + establish_db_connection_at_init=False + ) + + return empty_kb_factory diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/scripts/datasets/dataset.py index 1836d817d..abbcde618 100644 --- a/benchmarks/nel/scripts/datasets/dataset.py +++ b/benchmarks/nel/scripts/datasets/dataset.py @@ -24,6 +24,7 @@ from spacy.pipeline import EntityLinker from wikid import schemas +from wikid.scripts.kb import WikiKB from . import evaluation from utils import get_logger @@ -87,7 +88,8 @@ def compile_corpora(self, model: str, filter_terms: Optional[Set[str]] = None) - filter_terms (Optional[Set[str]]): Set of filter terms. Only documents containing at least one of the specified terms will be included in corpora. If None, all documents are included. """ - self._load_resource("annotations") + with open(self._paths["annotations"], "rb") as file: + self._annotations = pickle.load(file) Doc.set_extension("overlapping_annotations", default=None) nlp_components = ["tok2vec", "parser", "tagger", "attribute_ruler"] nlp = spacy.load(model, enable=nlp_components) @@ -166,43 +168,19 @@ def _serialize_corpora(self) -> None: corpus.to_disk(self._paths["corpora"] / f"{key}.spacy") logger.info(f"Completed serializing corpora at {self._paths['corpora']}.") - def _load_resource(self, key: str, force: bool = False) -> None: - """Loads serialized resource. - key (str): Resource key. Must be in self._paths. - force (bool): Load from disk even if already not None. - """ - - path = self._paths[key] - - if key == "nlp_best" and (force or not self._nlp_best): - self._nlp_best = spacy.load(path) - elif key == "kb" and (force or not self._kb): - nlp = spacy.load(self._paths["nlp_base"]) - # todo how to load knowledgebase if not all arguments are known? - # mandate factory method? - self._kb = KnowledgeBase( - vocab=nlp.vocab, - entity_vector_length=self._nlp_base.vocab.vectors_length, - ) - self._kb.from_disk(path) - elif key == "annotations" and (force or not self._annotations): - with open(path, "rb") as file: - self._annotations = pickle.load(file) - def evaluate(self, run_name: str) -> None: """Evaluates trained pipeline on test set. run_name (str): Run name. """ - # todo load KB with entity_linker.kb_loader (or retrieve directly from nlp?) - nlp = spacy.load(self._paths["nlp_best"]) - self._load_resource("kb") - self._load_resource("nlp_best") + nlp_base = spacy.load(self._paths["nlp_base"]) + self._nlp_best = spacy.load(self._paths["nlp_best"]) + self._kb = WikiKB.generate_from_disk(self._paths["kb"]) with open(self._paths["evaluation"], "r") as config_file: eval_config = yaml.safe_load(config_file) if eval_config["external"]["spacyfishing"]: - self._nlp_base.add_pipe("entityfishing", last=True) + nlp_base.add_pipe("entityfishing", last=True) # Apply config overrides, if defined. if "config_overrides" in eval_config and eval_config["config_overrides"]: @@ -212,29 +190,18 @@ def evaluate(self, run_name: str) -> None: # Infer test set. test_set_path = self._paths["corpora"] / "test.spacy" docs = list(DocBin().from_disk(test_set_path).get_docs(self._nlp_best.vocab)) - # spaCy sometimes includes leading articles in entities, our benchmark datasets don't. Hence we drop all - # leading "the " and adjust the entity positions accordingly. 
- for doc in docs: - doc.ents = [ - doc.char_span(ent.start_char + 4, ent.end_char, label=ent.label, kb_id=ent.kb_id) - if ent.text.lower().startswith("the ") else ent - for ent in doc.ents - ] - - for doc in docs: - pred_doc = self._nlp_best(doc) test_set = [ Example(predicted_doc, doc) for predicted_doc, doc in zip( [ doc for doc in tqdm.tqdm( - self._nlp_best.pipe(texts=[doc.text for doc in docs], n_process=-1, batch_size=500), + self._nlp_best.pipe(texts=docs, n_process=-1, batch_size=500), desc="Inferring entities for test set", total=len(docs) ) ], - DocBin().from_disk(self._paths["corpora"] / "test.spacy").get_docs(self._nlp_best.vocab) + docs ) ] @@ -249,14 +216,14 @@ def evaluate(self, run_name: str) -> None: for example in tqdm.tqdm(test_set, total=len(test_set), leave=True, desc="Evaluating test set"): example: Example if len(example) > 0: - entity_linker: Union[EntityLinker, EntityLinker_v1] = \ - self._nlp_best.get_pipe("entity_linker") # type: ignore + entity_linker: EntityLinker = self._nlp_best.get_pipe("entity_linker") # type: ignore ent_gold_ids = { evaluation.offset(ent.start_char, ent.end_char): ent.kb_id_ for ent in example.reference.ents } if len(ent_gold_ids) == 0: continue ent_pred_labels = {(ent.start_char, ent.end_char): ent.label_ for ent in example.predicted.ents} + # todo switch to get_candidates_all() here? ent_cands = { (ent.start_char, ent.end_char): { cand.entity_: cand for cand in entity_linker.get_candidates(self._kb, ent) @@ -271,6 +238,8 @@ def evaluate(self, run_name: str) -> None: # For the candidate generation evaluation also mis-aligned entities are considered. label = ent_pred_labels.get(ent_offset, "NIL") cand_gen_label_counts[label] += 1 + if ent.kb_id_ not in set(ent_cands.get(ent_offset, {})): + print(ent.kb_id_, set(ent_cands.get(ent_offset, {}))) candidate_results.update_metrics(label, ent.kb_id_, set(ent_cands.get(ent_offset, {}).keys())) # Update entity disambiguation stats for baselines. From de92c617114e1546f51a31fe34393eb971ab4d04 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 29 Nov 2022 22:41:28 +0100 Subject: [PATCH 31/38] Switch evaluation to get_candidates_all(). --- benchmarks/nel/scripts/datasets/dataset.py | 27 +++++++++++----------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/scripts/datasets/dataset.py index abbcde618..f419223ca 100644 --- a/benchmarks/nel/scripts/datasets/dataset.py +++ b/benchmarks/nel/scripts/datasets/dataset.py @@ -178,7 +178,6 @@ def evaluate(self, run_name: str) -> None: with open(self._paths["evaluation"], "r") as config_file: eval_config = yaml.safe_load(config_file) - if eval_config["external"]["spacyfishing"]: nlp_base.add_pipe("entityfishing", last=True) @@ -223,12 +222,12 @@ def evaluate(self, run_name: str) -> None: if len(ent_gold_ids) == 0: continue ent_pred_labels = {(ent.start_char, ent.end_char): ent.label_ for ent in example.predicted.ents} - # todo switch to get_candidates_all() here? - ent_cands = { - (ent.start_char, ent.end_char): { - cand.entity_: cand for cand in entity_linker.get_candidates(self._kb, ent) - } - for ent in example.reference.ents + ent_cands_by_offset = { + (ent.start_char, ent.end_char): {cand.entity_: cand for cand in ent_cands} + for ent, ent_cands in zip( + example.reference.ents, + next(entity_linker.get_candidates_all(self._kb, (ents for ents in [example.reference.ents]))) + ) } # Update candidate generation stats. 
@@ -238,9 +237,9 @@ def evaluate(self, run_name: str) -> None: # For the candidate generation evaluation also mis-aligned entities are considered. label = ent_pred_labels.get(ent_offset, "NIL") cand_gen_label_counts[label] += 1 - if ent.kb_id_ not in set(ent_cands.get(ent_offset, {})): - print(ent.kb_id_, set(ent_cands.get(ent_offset, {}))) - candidate_results.update_metrics(label, ent.kb_id_, set(ent_cands.get(ent_offset, {}).keys())) + candidate_results.update_metrics( + label, ent.kb_id_, set(ent_cands_by_offset.get(ent_offset, {}).keys()) + ) # Update entity disambiguation stats for baselines. evaluation.add_disambiguation_baseline( @@ -248,15 +247,15 @@ def evaluate(self, run_name: str) -> None: label_counts, example.predicted, ent_gold_ids, - ent_cands, + ent_cands_by_offset, ) # Update entity disambiguation stats for trained model. - evaluation.add_disambiguation_eval_result(trained_results, example.predicted, ent_gold_ids, ent_cands) + evaluation.add_disambiguation_eval_result(trained_results, example.predicted, ent_gold_ids, ent_cands_by_offset) - if eval_config["external"]["spacyfishing"]: + if eval_config["external"].get("spacyfishing", False): try: - doc = self._nlp_base(example.reference.text) + doc = nlp_base(example.reference.text) except TypeError: doc = None evaluation.add_disambiguation_spacyfishing_eval_result(spacyfishing_results, doc, ent_gold_ids) From ecb3d87ea401112417a841cb34e00539b520ff05 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 29 Nov 2022 23:14:27 +0100 Subject: [PATCH 32/38] Adjust config. --- benchmarks/nel/configs/nel.cfg | 4 ++-- benchmarks/nel/project.yml | 2 +- benchmarks/nel/scripts/cli_evaluate.py | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index de0713ef9..05990a198 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -37,8 +37,8 @@ source = "${paths.base_nlp}" [components.entity_linker] factory = "entity_linker" entity_vector_length = 64 -incl_context = true -incl_prior = true +incl_context = True +incl_prior = True labels_discard = [] get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} get_candidates_all = {"@misc":"spacy.CandidateAllGenerator.v1"} diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index a70125c7e..dda5b5c88 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -1,7 +1,7 @@ title: 'NEL Benchmark' description: "Pipeline for benchmarking NEL approaches (incl. candidate generation and entity disambiguation)." vars: - run: "cg-default" + run: "default" language: "en" config: "nel.cfg" base_model: "en_core_web_lg" diff --git a/benchmarks/nel/scripts/cli_evaluate.py b/benchmarks/nel/scripts/cli_evaluate.py index 6f52cdd02..3dbabb612 100644 --- a/benchmarks/nel/scripts/cli_evaluate.py +++ b/benchmarks/nel/scripts/cli_evaluate.py @@ -14,5 +14,4 @@ def main(dataset_name: str, run_name: str, language: str): if __name__ == "__main__": - main("mewsli_9", "cg-default", "en") - # typer.run(main) + typer.run(main) From adf8a7e5a57eada9ac274989fe4e35b03ac2d8a8 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 1 Dec 2022 17:12:18 +0100 Subject: [PATCH 33/38] Further adjustments in NEL workflow + script for debugging training via API. 
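
Besides the new scripts/train.py for debugging training via the API,
evaluation can now also be driven from Python with an explicit GPU ID. A
minimal invocation, assuming the scripts directory is on the path as in the
project commands (dataset/run/language values are the Mewsli-9 defaults):

    import custom_functions  # noqa: F401 - registers readers and misc functions
    from datasets.dataset import Dataset

    # Evaluate the trained "default" run on GPU 0; pass gpu_id=None to stay on CPU.
    Dataset.generate_from_id("mewsli_9", "en", "default").evaluate(gpu_id=0)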
--- benchmarks/nel/configs/nel.cfg | 19 +++--- benchmarks/nel/project.yml | 2 +- benchmarks/nel/scripts/cli_compile_corpora.py | 3 +- benchmarks/nel/scripts/cli_evaluate.py | 10 ++- benchmarks/nel/scripts/custom_functions.py | 67 +++++++++++++++---- benchmarks/nel/scripts/datasets/dataset.py | 15 +++-- benchmarks/nel/scripts/datasets/mewsli_9.py | 4 -- benchmarks/nel/scripts/datasets/utils.py | 3 +- benchmarks/nel/scripts/train.py | 24 +++++++ 9 files changed, 107 insertions(+), 40 deletions(-) create mode 100644 benchmarks/nel/scripts/train.py diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index 05990a198..5fa515d9a 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -18,7 +18,7 @@ gpu_allocator = null [nlp] lang = "en" -pipeline = ["senter", "parser", "entity_linker"] +pipeline = ["entity_linker"] disabled = [] before_creation = null after_creation = null @@ -28,12 +28,6 @@ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} [components] -[components.senter] -source = "${paths.base_nlp}" - -[components.parser] -source = "${paths.base_nlp}" - [components.entity_linker] factory = "entity_linker" entity_vector_length = 64 @@ -80,12 +74,15 @@ kb_path = ${paths.kb} [corpora] [corpora.train] -@readers = "EntityEnrichedCorpusReader.v1" +@readers = "EntityEnrichedCorpusReader.v2" path = ${paths.train} +path_nlp_base = ${paths.vectors} [corpora.dev] -@readers = "spacy.Corpus.v1" +@readers = "EntityEnrichedCorpusReader.v2" +;@readers = "spacy.Corpus.v1" path = ${paths.dev} +path_nlp_base = ${paths.vectors} [training] @@ -99,8 +96,8 @@ eval_frequency = 200 accumulate_gradient = 2 max_epochs = 0 max_steps = 500 -annotating_components = ["senter","parser"] -frozen_components = ["senter","parser"] +annotating_components = [] +frozen_components = [] before_to_disk = null [training.logger] diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index dda5b5c88..9e56b2327 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -126,7 +126,7 @@ commands: - name: evaluate help: "Evaluate on the test set." script: - - "env PYTHONPATH=. python ./scripts/cli_evaluate.py ${vars.dataset} '${vars.run}' ${vars.language}" + - "env PYTHONPATH=. python ./scripts/cli_evaluate.py ${vars.dataset} '${vars.run}' ${vars.language} ${vars.gpu_id}" deps: - "training/${vars.dataset}/${vars.run}/model-best" - "training/base-nlp/${vars.language}" diff --git a/benchmarks/nel/scripts/cli_compile_corpora.py b/benchmarks/nel/scripts/cli_compile_corpora.py index 4b682ee8b..33a9de64d 100644 --- a/benchmarks/nel/scripts/cli_compile_corpora.py +++ b/benchmarks/nel/scripts/cli_compile_corpora.py @@ -27,4 +27,5 @@ def main(dataset_name: str, language: str, model: str, use_filter_terms: bool = if __name__ == "__main__": - typer.run(main) + main("mewsli_9", "en", "en_core_web_lg", True) + # typer.run(main) diff --git a/benchmarks/nel/scripts/cli_evaluate.py b/benchmarks/nel/scripts/cli_evaluate.py index 3dbabb612..475b43850 100644 --- a/benchmarks/nel/scripts/cli_evaluate.py +++ b/benchmarks/nel/scripts/cli_evaluate.py @@ -1,17 +1,21 @@ """ Evaluation on test data. """ +from typing import Optional + from datasets.dataset import Dataset import custom_functions import typer -def main(dataset_name: str, run_name: str, language: str): +def main(dataset_name: str, run_name: str, language: str, gpu_id: Optional[int] = typer.Argument(None)): """Evaluate the trained EL component by applying it to unseen text. 
dataset_name (str): Name of dataset to evaluate on. run_name (str): Run name. language (str): Language. + gpu_id (Optional[int]): ID of GPU to utilize for evaluation. """ - Dataset.generate_from_id(dataset_name, language, run_name).evaluate(run_name=run_name) + Dataset.generate_from_id(dataset_name, language, run_name).evaluate(gpu_id=gpu_id) if __name__ == "__main__": - typer.run(main) + main("mewsli_9", "default", "en", 0) + # typer.run(main) diff --git a/benchmarks/nel/scripts/custom_functions.py b/benchmarks/nel/scripts/custom_functions.py index 94581061c..100feaa53 100644 --- a/benchmarks/nel/scripts/custom_functions.py +++ b/benchmarks/nel/scripts/custom_functions.py @@ -11,28 +11,67 @@ @spacy.registry.readers("EntityEnrichedCorpusReader.v1") -def create_docbin_reader(path: Path) -> Callable[[Language], Iterable[Example]]: +def create_docbin_reader(path: Path, path_nlp_base: Path) -> Callable[[Language], Iterable[Example]]: """Returns Callable generating a corpus reader function that enriches read documents with the correct entities as specified in the corpus annotations. path (Path): Path to DocBin file with documents to prepare. + path_nlp_base (Path): Path to pipeline for tokenization/sentence. """ - def read_docbin(nlp: Language) -> Iterable[Example]: + def read_docbin(_: Language) -> Iterable[Example]: """Read DocBin for training. Set all entities as they appear in the annotated corpus, but set entity type to NIL. nlp (Language): Pipeline to use for creating document used in EL from reference document. """ - nlp.disable_pipe("entity_linker") - - with nlp.select_pipes(disable="entity_linker"): - for doc in DocBin().from_disk(path).get_docs(nlp.vocab): - pred_doc = nlp(doc.text) - pred_doc.ents = [ - doc.char_span(ent.start_char, ent.end_char, label=EntityLinker.NIL, kb_id=EntityLinker.NIL) - for ent in doc.ents - ] - yield Example(pred_doc, doc) - - nlp.enable_pipe("entity_linker") + nlp = spacy.load(path_nlp_base, enable=["senter"]) + + for doc in DocBin().from_disk(path).get_docs(nlp.vocab): + pred_doc = nlp(doc.text) + pred_doc.ents = [ + pred_doc.char_span(ent.start_char, ent.end_char, label=EntityLinker.NIL, kb_id=EntityLinker.NIL) + for ent in doc.ents + ] + sents = list(pred_doc.sents) + sents_orig = list(doc.sents) + assert len(sents) == len(sents_orig) + assert len(sents) > 0 and len(sents_orig) > 0 + assert all([ent is not None for ent in pred_doc.ents]) + assert len(doc.ents) == len(pred_doc.ents) + assert len(doc.ents) > 0 + + yield Example(pred_doc, doc) + + return read_docbin + + +@spacy.registry.readers("EntityEnrichedCorpusReader.v2") +def create_docbin_reader(path: Path, path_nlp_base: Path) -> Callable[[Language], Iterable[Example]]: + """Returns Callable generating a corpus reader function that enriches read documents with the correct entities as + specified in the corpus annotations. + path (Path): Path to DocBin file with documents to prepare. + path_nlp_base (Path): Path to pipeline for tokenization/sentence. + """ + def read_docbin(_: Language) -> Iterable[Example]: + """Read DocBin for training. Set all entities as they appear in the annotated corpus, but set entity type and KB + ID to NIL. + nlp (Language): Pipeline to use for creating document used in EL from reference document. 
+ """ + nlp = spacy.load(path_nlp_base, enable=["sentencizer"]) + for example in spacy.training.Corpus(path)(nlp): + example.predicted = nlp(example.predicted) + example.predicted.ents = [ + example.predicted.char_span(ent.start_char, ent.end_char, label=EntityLinker.NIL, kb_id=EntityLinker.NIL) + for ent in example.reference.ents + ] + sents = list(example.predicted.sents) + sents_orig = list(example.reference.sents) + + assert len(sents) == len(sents_orig) + assert len(sents) > 0 and len(sents_orig) > 0 + assert all([ent is not None for ent in example.predicted.ents]) + assert len(example.reference.ents) == len(example.predicted.ents) + assert len(example.reference.ents) > 0 + + yield Example(example.predicted, example.reference) return read_docbin diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/scripts/datasets/dataset.py index f419223ca..33e8d02fd 100644 --- a/benchmarks/nel/scripts/datasets/dataset.py +++ b/benchmarks/nel/scripts/datasets/dataset.py @@ -91,8 +91,9 @@ def compile_corpora(self, model: str, filter_terms: Optional[Set[str]] = None) - with open(self._paths["annotations"], "rb") as file: self._annotations = pickle.load(file) Doc.set_extension("overlapping_annotations", default=None) - nlp_components = ["tok2vec", "parser", "tagger", "attribute_ruler"] + nlp_components = ["tok2vec", "tagger", "attribute_ruler"] nlp = spacy.load(model, enable=nlp_components) + nlp.add_pipe("sentencizer") # Incorporate annotations from corpus into documents. Only keep docs with entities (relevant mostly when working # with filtered data). @@ -102,7 +103,7 @@ def compile_corpora(self, model: str, filter_terms: Optional[Set[str]] = None) - self._paths["nlp_base"].parent.mkdir(parents=True, exist_ok=True) nlp.to_disk( self._paths["nlp_base"], - exclude=[comp for comp in nlp.component_names if comp not in nlp_components] + exclude=[comp for comp in nlp.component_names if comp not in [*nlp_components, "sentencizer"]] ) self._serialize_corpora() @@ -168,10 +169,14 @@ def _serialize_corpora(self) -> None: corpus.to_disk(self._paths["corpora"] / f"{key}.spacy") logger.info(f"Completed serializing corpora at {self._paths['corpora']}.") - def evaluate(self, run_name: str) -> None: + def evaluate(self, gpu_id: Optional[int] = None) -> None: """Evaluates trained pipeline on test set. run_name (str): Run name. + gpu_id (Optional[int]): ID of GPU to utilize. """ + if gpu_id is not None: + spacy.require_gpu(gpu_id) + nlp_base = spacy.load(self._paths["nlp_base"]) self._nlp_best = spacy.load(self._paths["nlp_best"]) self._kb = WikiKB.generate_from_disk(self._paths["kb"]) @@ -195,7 +200,7 @@ def evaluate(self, run_name: str) -> None: for predicted_doc, doc in zip( [ doc for doc in tqdm.tqdm( - self._nlp_best.pipe(texts=docs, n_process=-1, batch_size=500), + self._nlp_best.pipe(texts=docs, n_process=1 if gpu_id else -1, batch_size=500), desc="Inferring entities for test set", total=len(docs) ) @@ -273,7 +278,7 @@ def evaluate(self, run_name: str) -> None: eval_results.append(spacyfishing_results) logger.info(dict(cand_gen_label_counts)) - evaluation.EvaluationResults.report(tuple(eval_results), run_name=run_name, dataset_name=self.name) + evaluation.EvaluationResults.report(tuple(eval_results), run_name=self._run_name, dataset_name=self.name) def compare_evaluations(self, highlight_criterion: str) -> None: """Generate and display table for comparison of all available runs for this dataset. 
diff --git a/benchmarks/nel/scripts/datasets/mewsli_9.py b/benchmarks/nel/scripts/datasets/mewsli_9.py index faba5b9b1..d21bd2a31 100644 --- a/benchmarks/nel/scripts/datasets/mewsli_9.py +++ b/benchmarks/nel/scripts/datasets/mewsli_9.py @@ -55,10 +55,6 @@ def _create_annotated_docs(self, nlp: Language, filter_terms: Optional[Set[str]] with open( self._paths["assets"] / "clean" / "en" / "docs.tsv", encoding="utf-8" ) as title_file: - # todo - # - update nel.cfg with correct file path - # - add KB loader - code and to config - # - ensure training runs and uses WikiKB row_count = sum(1 for _ in title_file) title_file.seek(0) n_annots_available = 0 diff --git a/benchmarks/nel/scripts/datasets/utils.py b/benchmarks/nel/scripts/datasets/utils.py index fa5281fbb..6dd5c6e44 100644 --- a/benchmarks/nel/scripts/datasets/utils.py +++ b/benchmarks/nel/scripts/datasets/utils.py @@ -3,6 +3,7 @@ from typing import Dict, List, Set, Tuple import tqdm from spacy.tokens import Token, Span, Doc +from spacy.pipeline import EntityLinker from wikid import schemas, load_entities @@ -127,7 +128,7 @@ def create_spans_from_doc_annotation( doc_spans = [ # No label/entity type information available. doc.char_span( - annot.start_pos, annot.end_pos, label="NIL", kb_id=annot.entity_id + annot.start_pos, annot.end_pos, label=EntityLinker.NIL, kb_id=annot.entity_id ) for annot in doc_annots ] diff --git a/benchmarks/nel/scripts/train.py b/benchmarks/nel/scripts/train.py new file mode 100644 index 000000000..eed760494 --- /dev/null +++ b/benchmarks/nel/scripts/train.py @@ -0,0 +1,24 @@ +""" +API call for training. Mainly for debugging purposes. +""" +from pathlib import Path +import custom_functions +from spacy.cli.train import train + +if __name__ == '__main__': + root = Path(__file__).parent.parent + train( + root / "configs" / "nel.cfg", + output_path=root / "training" / "mewsli_9" / "default", + use_gpu=0, + overrides={ + "paths.dataset_name": "mewsli_9", + "paths.train": "corpora/mewsli_9/train.spacy", + "paths.dev": "corpora/mewsli_9/dev.spacy", + "paths.kb": "wikid/output/en/kb", + "paths.db": "wikid/output/en/wiki.sqlite3", + "paths.base_nlp": "training/base-nlp/en", + "paths.language": "en", + "training.max_steps": 10, + } + ) From 7bc9dd8c5d4d1c45dc80f6486c9ce328e07b94fc Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 5 Dec 2022 16:57:56 +0100 Subject: [PATCH 34/38] Refactor. Start to add support for separate mention-candidate retrieval step for easier testing. 
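
The retrieval logic behind cli_retrieve_mentions_candidates.py is still a stub
(see the todo in Dataset.retrieve_candidates_for_mentions()). The rough idea,
where the function name, arguments and output layout are hypothetical and only
the KB/entity-linker calls mirror existing code:

    import pickle
    from pathlib import Path
    from typing import Dict

    import spacy
    from spacy.tokens import Doc
    from wikid.src.kb import WikiKB

    def retrieve_mention_candidates(
        nlp_path: Path, kb_path: Path, docs: Dict[str, Doc], out_path: Path
    ) -> None:
        # Reuse the candidate generator the entity_linker is configured with,
        # so the persisted candidates match what training/evaluation will see.
        nlp = spacy.load(nlp_path)
        kb = WikiKB.generate_from_disk(kb_path)
        entity_linker = nlp.get_pipe("entity_linker")
        candidates = {
            (doc_id, ent.start_char, ent.end_char): [
                cand.entity_ for cand in entity_linker.get_candidates(kb, ent)
            ]
            for doc_id, doc in docs.items()
            for ent in doc.ents
        }
        with open(out_path, "wb") as file:
            pickle.dump(candidates, file)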
--- benchmarks/nel/configs/nel.cfg | 3 +- benchmarks/nel/project.yml | 30 +++- benchmarks/nel/scripts/datasets/mewsli_9.py | 107 ------------ benchmarks/nel/{scripts => src}/__init__.py | 0 .../nel/{scripts => src}/cli_clean_data.py | 0 .../cli_compare_evaluations.py | 0 .../{scripts => src}/cli_compile_corpora.py | 2 +- .../nel/{scripts => src}/cli_evaluate.py | 0 .../cli_extract_annotations.py | 0 .../src/cli_retrieve_mentions_candidates.py | 16 ++ .../nel/{scripts => src}/custom_functions.py | 19 ++- .../nel/{scripts => src}/datasets/__init__.py | 0 .../nel/{scripts => src}/datasets/dataset.py | 60 ++++++- .../datasets/download_mewsli-9.sh | 0 .../{scripts => src}/datasets/evaluation.py | 0 benchmarks/nel/src/datasets/mewsli_9.py | 155 ++++++++++++++++++ .../nel/{scripts => src}/datasets/utils.py | 0 .../nel/{scripts => src}/parse_corpus.py | 0 benchmarks/nel/{scripts => src}/train.py | 10 +- benchmarks/nel/{scripts => src}/train.sh | 4 +- benchmarks/nel/{scripts => src}/utils.py | 0 21 files changed, 270 insertions(+), 136 deletions(-) delete mode 100644 benchmarks/nel/scripts/datasets/mewsli_9.py rename benchmarks/nel/{scripts => src}/__init__.py (100%) rename benchmarks/nel/{scripts => src}/cli_clean_data.py (100%) rename benchmarks/nel/{scripts => src}/cli_compare_evaluations.py (100%) rename benchmarks/nel/{scripts => src}/cli_compile_corpora.py (95%) rename benchmarks/nel/{scripts => src}/cli_evaluate.py (100%) rename benchmarks/nel/{scripts => src}/cli_extract_annotations.py (100%) create mode 100644 benchmarks/nel/src/cli_retrieve_mentions_candidates.py rename benchmarks/nel/{scripts => src}/custom_functions.py (88%) rename benchmarks/nel/{scripts => src}/datasets/__init__.py (100%) rename benchmarks/nel/{scripts => src}/datasets/dataset.py (85%) rename benchmarks/nel/{scripts => src}/datasets/download_mewsli-9.sh (100%) rename benchmarks/nel/{scripts => src}/datasets/evaluation.py (100%) create mode 100644 benchmarks/nel/src/datasets/mewsli_9.py rename benchmarks/nel/{scripts => src}/datasets/utils.py (100%) rename benchmarks/nel/{scripts => src}/parse_corpus.py (100%) rename benchmarks/nel/{scripts => src}/train.py (60%) rename benchmarks/nel/{scripts => src}/train.sh (86%) rename benchmarks/nel/{scripts => src}/utils.py (100%) diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index 5fa515d9a..a9b806389 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -80,7 +80,6 @@ path_nlp_base = ${paths.vectors} [corpora.dev] @readers = "EntityEnrichedCorpusReader.v2" -;@readers = "spacy.Corpus.v1" path = ${paths.dev} path_nlp_base = ${paths.vectors} @@ -102,7 +101,7 @@ before_to_disk = null [training.logger] @loggers = "spacy.ConsoleLogger.v1" -progress_bar = false +progress_bar = true [training.batcher] @batchers = "spacy.batch_by_words.v1" diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 9e56b2327..1b66d1551 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -10,10 +10,10 @@ vars: gpu_id: "" download_all_wiki_assets: "" # "--extra" to download full Wiki dumps. filter: "True" # Whether to only use parts of Wiki data and corpus containing filter terms. 
- training_max_steps: 1000 + training_max_steps: 10 eval_highlight_metric: "F" # one of ("F", "r", "p") -directories: ["assets", "training", "configs", "scripts", "corpora", "evaluation"] +directories: ["assets", "training", "configs", "src", "corpora", "evaluation"] check_requirements: True @@ -39,7 +39,7 @@ commands: - name: download_mewsli9 help: Download Mewsli-9 dataset. script: - - bash scripts/datasets/download_mewsli-9.sh + - bash src/datasets/download_mewsli-9.sh outputs: - assets/mewsli_9/ @@ -57,7 +57,7 @@ commands: - name: preprocess help: Preprocess and clean corpus data. script: - - "env PYTHONPATH=. python ./scripts/cli_clean_data.py ${vars.dataset} ${vars.language}" + - "env PYTHONPATH=. python ./src/cli_clean_data.py ${vars.dataset} ${vars.language}" deps: - "assets/${vars.dataset}/raw" outputs: @@ -90,17 +90,29 @@ commands: - name: extract_annotations help: "Extract annotations from corpus." script: - - "env PYTHONPATH=. python ./scripts/cli_extract_annotations.py ${vars.dataset} ${vars.language}" + - "env PYTHONPATH=. python ./src/cli_extract_annotations.py ${vars.dataset} ${vars.language}" deps: - "assets/${vars.dataset}/clean" - "wikid/output/${vars.language}/wiki.sqlite3" outputs: - "assets/${vars.dataset}/annotations.pkl" + - name: retrieve_mentions_candidates + help: "Retrieve candidates for mentions in corpus and persist them in file." + script: + - "env PYTHONPATH=. python ./src/cli_retrieve_mentions_candidates.py ${vars.dataset} ${vars.language}" + deps: + - "wikid/output/${vars.language}/kb" + - "wikid/output/${vars.language}/wiki.annoy" + - "wikid/output/${vars.language}/wiki.sqlite3" + - "assets/${vars.dataset}/annotations.pkl" + outputs: + - "corpora/${vars.dataset}/mention_candidates.pkl" + - name: compile_corpora help: "Compile corpora, separated in train/dev/test sets." script: - - "env PYTHONPATH=. python ./scripts/cli_compile_corpora.py ${vars.dataset} ${vars.language} ${vars.base_model} ${vars.filter}" + - "env PYTHONPATH=. python ./src/cli_compile_corpora.py ${vars.dataset} ${vars.language} ${vars.base_model} ${vars.filter}" deps: - "assets/${vars.dataset}/annotations.pkl" - "wikid/output/${vars.language}/kb" @@ -113,7 +125,7 @@ commands: - name: train help: "Train a new Entity Linking component. Pass --vars.gpu_id GPU_ID to train with GPU. Training with some datasets may take a long time!" script: - - "bash scripts/train.sh ${vars.dataset} '${vars.run}' ${vars.language} ${vars.config} ${vars.training_max_steps} ${vars.gpu_id}" + - "bash src/train.sh ${vars.dataset} '${vars.run}' ${vars.language} ${vars.config} ${vars.training_max_steps} ${vars.gpu_id}" outputs: - "training/${vars.dataset}/${vars.run}" deps: @@ -126,7 +138,7 @@ commands: - name: evaluate help: "Evaluate on the test set." script: - - "env PYTHONPATH=. python ./scripts/cli_evaluate.py ${vars.dataset} '${vars.run}' ${vars.language} ${vars.gpu_id}" + - "env PYTHONPATH=. python ./src/cli_evaluate.py ${vars.dataset} '${vars.run}' ${vars.language} ${vars.gpu_id}" deps: - "training/${vars.dataset}/${vars.run}/model-best" - "training/base-nlp/${vars.language}" @@ -139,7 +151,7 @@ commands: - name: compare_evaluations help: "Compare available set of evaluation runs." script: - - "env PYTHONPATH=. python ./scripts/cli_compare_evaluations.py ${vars.dataset} ${vars.language} --highlight-criterion ${vars.eval_highlight_metric}" + - "env PYTHONPATH=. 
python ./src/cli_compare_evaluations.py ${vars.dataset} ${vars.language} --highlight-criterion ${vars.eval_highlight_metric}" deps: - "evaluation/${vars.dataset}" diff --git a/benchmarks/nel/scripts/datasets/mewsli_9.py b/benchmarks/nel/scripts/datasets/mewsli_9.py deleted file mode 100644 index d21bd2a31..000000000 --- a/benchmarks/nel/scripts/datasets/mewsli_9.py +++ /dev/null @@ -1,107 +0,0 @@ -""" Dataset class for Mewsli-9 dataset. """ -import csv -import distutils.dir_util -import time -from typing import Tuple, Set, List, Dict, Optional - -import spacy -import tqdm -from spacy import Language -from spacy.tokens import Doc - -from datasets.dataset import Dataset -from datasets.utils import fetch_entity_information, create_spans_from_doc_annotation -from wikid import schemas, load_entities - - -class Mewsli9Dataset(Dataset): - """Mewsli-9 dataset.""" - - @property - def name(self) -> str: - return "mewsli_9" - - def _extract_annotations_from_corpus( - self, **kwargs - ) -> Dict[str, List[schemas.Annotation]]: - annotations: Dict[str, List[schemas.Annotation]] = {} - - with open( - self._paths["assets"] / "clean" / "en" / "mentions.tsv", encoding="utf-8" - ) as file_path: - for i, row in enumerate(csv.DictReader(file_path, delimiter="\t")): - assert len(row) == 9 - - if row["docid"] not in annotations: - annotations[row["docid"]] = [] - annotations[row["docid"]].append( - schemas.Annotation( - entity_name=row["url"].split("/")[-1].replace("_", " "), - entity_id=row["qid"], - start_pos=int(row["position"]), - end_pos=int(row["position"]) + int(row["length"]), - ) - ) - - return annotations - - def clean_assets(self) -> None: - # No cleaning necessary, just copy all data into /clean. - distutils.dir_util.copy_tree(str(self._paths["assets"] / "raw"), str(self._paths["assets"] / "clean")) - - def _create_annotated_docs(self, nlp: Language, filter_terms: Optional[Set[str]] = None) -> List[Doc]: - annotated_docs: List[Doc] = [] - - with open( - self._paths["assets"] / "clean" / "en" / "docs.tsv", encoding="utf-8" - ) as title_file: - row_count = sum(1 for _ in title_file) - title_file.seek(0) - n_annots_available = 0 - n_annots_assigned = 0 - - # Load entities batched to avoid hitting max. number of parameters supported by SQLite. - batch_size = 2**14 - qids = tuple({annot.entity_id for annots in self._annotations.values() for annot in annots}) - entities = { - qid: entity_info - for entity_batch in - [ - load_entities(qids=qids[i:i + batch_size], language=self._language) - for i in range(0, len(qids), batch_size) - ] - for qid, entity_info in entity_batch.items() - } - - with tqdm.tqdm( - desc="Creating doc objects", total=row_count, leave=False - ) as pbar: - for row in csv.DictReader(title_file, delimiter="\t"): - with open( - self._paths["assets"] / "clean" / "en" / "text" / row["docid"], - encoding="utf-8", - ) as text_file: - # Replace newlines with whitespace and \xa0 (non-breaking whitespace) appearing after titles - # with a period. This maintains the correct offsets in the dataset annotations. 
- doc_text = "".join([ - line.replace("\n", " ").replace("\xa0", ".") for line in text_file.readlines() - ]) - - if filter_terms and not any([ft in doc_text for ft in filter_terms]): - pbar.update(1) - continue - - doc = nlp(doc_text) - doc_annots = self._annotations.get(row["docid"], []) - doc.ents, _ = create_spans_from_doc_annotation( - doc=doc, entities_info=entities, annotations=doc_annots, - ) - annotated_docs.append(doc) - n_annots_available += len(doc_annots) - n_annots_assigned += len(doc.ents) - pbar.update(1) - - print(f"Assigned {n_annots_assigned} out of {n_annots_available} annotations " - f"({(n_annots_assigned / n_annots_available * 100):.2f}%) in {pbar.n} docs.") - - return annotated_docs diff --git a/benchmarks/nel/scripts/__init__.py b/benchmarks/nel/src/__init__.py similarity index 100% rename from benchmarks/nel/scripts/__init__.py rename to benchmarks/nel/src/__init__.py diff --git a/benchmarks/nel/scripts/cli_clean_data.py b/benchmarks/nel/src/cli_clean_data.py similarity index 100% rename from benchmarks/nel/scripts/cli_clean_data.py rename to benchmarks/nel/src/cli_clean_data.py diff --git a/benchmarks/nel/scripts/cli_compare_evaluations.py b/benchmarks/nel/src/cli_compare_evaluations.py similarity index 100% rename from benchmarks/nel/scripts/cli_compare_evaluations.py rename to benchmarks/nel/src/cli_compare_evaluations.py diff --git a/benchmarks/nel/scripts/cli_compile_corpora.py b/benchmarks/nel/src/cli_compile_corpora.py similarity index 95% rename from benchmarks/nel/scripts/cli_compile_corpora.py rename to benchmarks/nel/src/cli_compile_corpora.py index 33a9de64d..14a0559dc 100644 --- a/benchmarks/nel/scripts/cli_compile_corpora.py +++ b/benchmarks/nel/src/cli_compile_corpora.py @@ -27,5 +27,5 @@ def main(dataset_name: str, language: str, model: str, use_filter_terms: bool = if __name__ == "__main__": - main("mewsli_9", "en", "en_core_web_lg", True) + main("mewsli_9", "en", "en_core_web_lg", False) # typer.run(main) diff --git a/benchmarks/nel/scripts/cli_evaluate.py b/benchmarks/nel/src/cli_evaluate.py similarity index 100% rename from benchmarks/nel/scripts/cli_evaluate.py rename to benchmarks/nel/src/cli_evaluate.py diff --git a/benchmarks/nel/scripts/cli_extract_annotations.py b/benchmarks/nel/src/cli_extract_annotations.py similarity index 100% rename from benchmarks/nel/scripts/cli_extract_annotations.py rename to benchmarks/nel/src/cli_extract_annotations.py diff --git a/benchmarks/nel/src/cli_retrieve_mentions_candidates.py b/benchmarks/nel/src/cli_retrieve_mentions_candidates.py new file mode 100644 index 000000000..1600c3379 --- /dev/null +++ b/benchmarks/nel/src/cli_retrieve_mentions_candidates.py @@ -0,0 +1,16 @@ +"""Retrieve candidates for mentions in corpus.""" +import typer as typer + +from datasets.dataset import Dataset + + +def main(dataset_name: str, language: str): + """Retrieve candidates for mentions in corpus. + dataset_name (str): Name of dataset to evaluate on. + language (str): Language. 
+ """ + Dataset.generate_from_id(dataset_name, language, "").retrieve_candidates_for_mentions() + + +if __name__ == "__main__": + typer.run(main) diff --git a/benchmarks/nel/scripts/custom_functions.py b/benchmarks/nel/src/custom_functions.py similarity index 88% rename from benchmarks/nel/scripts/custom_functions.py rename to benchmarks/nel/src/custom_functions.py index 100feaa53..e15cda1da 100644 --- a/benchmarks/nel/scripts/custom_functions.py +++ b/benchmarks/nel/src/custom_functions.py @@ -7,7 +7,7 @@ from spacy.training import Example from spacy.pipeline import EntityLinker -from wikid.scripts.kb import WikiKB +from wikid.src.kb import WikiKB @spacy.registry.readers("EntityEnrichedCorpusReader.v1") @@ -22,7 +22,7 @@ def read_docbin(_: Language) -> Iterable[Example]: NIL. nlp (Language): Pipeline to use for creating document used in EL from reference document. """ - nlp = spacy.load(path_nlp_base, enable=["senter"]) + nlp = spacy.load(path_nlp_base, enable=["senter"], config={"nlp.disabled": []}) for doc in DocBin().from_disk(path).get_docs(nlp.vocab): pred_doc = nlp(doc.text) @@ -55,7 +55,8 @@ def read_docbin(_: Language) -> Iterable[Example]: ID to NIL. nlp (Language): Pipeline to use for creating document used in EL from reference document. """ - nlp = spacy.load(path_nlp_base, enable=["sentencizer"]) + nlp = spacy.load(path_nlp_base) + for example in spacy.training.Corpus(path)(nlp): example.predicted = nlp(example.predicted) example.predicted.ents = [ @@ -65,6 +66,18 @@ def read_docbin(_: Language) -> Iterable[Example]: sents = list(example.predicted.sents) sents_orig = list(example.reference.sents) + if len(sents) != len(sents_orig): + for i in range(max(len(sents), len(sents_orig))): + if i < len(sents): + print(sents[i]) + else: + print("out") + if i < len(sents_orig): + print(sents_orig[i]) + else: + print("out") + print("-----") + x = 3 assert len(sents) == len(sents_orig) assert len(sents) > 0 and len(sents_orig) > 0 assert all([ent is not None for ent in example.predicted.ents]) diff --git a/benchmarks/nel/scripts/datasets/__init__.py b/benchmarks/nel/src/datasets/__init__.py similarity index 100% rename from benchmarks/nel/scripts/datasets/__init__.py rename to benchmarks/nel/src/datasets/__init__.py diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/src/datasets/dataset.py similarity index 85% rename from benchmarks/nel/scripts/datasets/dataset.py rename to benchmarks/nel/src/datasets/dataset.py index 33e8d02fd..3fa0483be 100644 --- a/benchmarks/nel/scripts/datasets/dataset.py +++ b/benchmarks/nel/src/datasets/dataset.py @@ -1,5 +1,6 @@ """ Dataset class. """ import abc +import copy import csv import datetime import importlib @@ -18,13 +19,12 @@ import yaml from spacy import Language from spacy.kb import KnowledgeBase -from spacy.pipeline.legacy import EntityLinker_v1 from spacy.tokens import Doc, DocBin from spacy.training import Example from spacy.pipeline import EntityLinker from wikid import schemas -from wikid.scripts.kb import WikiKB +from wikid.src.kb import WikiKB from . 
import evaluation from utils import get_logger @@ -91,19 +91,18 @@ def compile_corpora(self, model: str, filter_terms: Optional[Set[str]] = None) - with open(self._paths["annotations"], "rb") as file: self._annotations = pickle.load(file) Doc.set_extension("overlapping_annotations", default=None) - nlp_components = ["tok2vec", "tagger", "attribute_ruler"] - nlp = spacy.load(model, enable=nlp_components) - nlp.add_pipe("sentencizer") + nlp_components = ["tok2vec", "senter", "tagger", "attribute_ruler"] + nlp = spacy.load(model) # , enable=nlp_components, config={"nlp.disabled": []} # Incorporate annotations from corpus into documents. Only keep docs with entities (relevant mostly when working # with filtered data). - self._annotated_docs = [doc for doc in self._create_annotated_docs(nlp, filter_terms) if len(doc.ents)] + self._annotated_docs = [doc for doc in self._create_annotated_docs(nlp, filter_terms) if len(doc.ents)][:500] # Serialize pipeline and corpora. self._paths["nlp_base"].parent.mkdir(parents=True, exist_ok=True) nlp.to_disk( self._paths["nlp_base"], - exclude=[comp for comp in nlp.component_names if comp not in [*nlp_components, "sentencizer"]] + # exclude=[comp for comp in nlp.component_names if comp not in [*nlp_components]] ) self._serialize_corpora() @@ -365,3 +364,50 @@ def clean_assets(self) -> None: automatically. """ raise NotImplementedError + + def _collapse_spaces(self, doc_id: str, doc_text: str) -> Tuple[str, List[schemas.Annotation]]: + """ + Replace multiple spaces with singles to avoid tokenization & sentence splitting issues later + in pipeline. + doc_id (str): Doc ID to be looked up in self._annotations. + doc_text (str): Doc text. + RETURNS (Annotation): Potentially updated (1) doc text, (2) annotations (start/end positions may have changed). + """ + doc_annots = self._annotations.get(doc_id, []) + doc_text_orig = doc_text + annots_orig = copy.deepcopy(doc_annots) + + # This is inefficient and could surely be optimized. + multi_space_start_idx = doc_text.find(" ") + while multi_space_start_idx >= 0: + multi_space_stop_idx = multi_space_start_idx + 2 + while multi_space_stop_idx == " ": + multi_space_stop_idx += 1 + + # Shrink multiple whitespaces to one. + doc_text = doc_text[:multi_space_start_idx] + " " + doc_text[multi_space_stop_idx:] + + # Adjust annotations indices accordingly. 
+ for i, annot in enumerate(doc_annots): + annot_text_orig = doc_text_orig[annots_orig[i].start_pos:annots_orig[i].end_pos] + if multi_space_start_idx <= annot.start_pos < multi_space_stop_idx: + offset = multi_space_start_idx + 1 - annot.start_pos + annot.start_pos -= offset + annot.end_pos -= offset + elif annot.start_pos >= multi_space_stop_idx: + offset = multi_space_stop_idx - multi_space_start_idx - 1 + annot.start_pos -= offset + annot.end_pos -= offset + + # New annotation should match old one, except for leading/trailing spaces + assert doc_text[annot.start_pos:annot.end_pos].split() == " ".join(annot_text_orig.split()) + + multi_space_start_idx = doc_text.find(" ", multi_space_start_idx) + + assert doc_text.find(" ") == -1 + + return doc_text, doc_annots + + def retrieve_candidates_for_mentions(self) -> None: + """Retrieves candidates for all mentions in corpus.""" + # todo load KB, read from annotations pickle, persist results \ No newline at end of file diff --git a/benchmarks/nel/scripts/datasets/download_mewsli-9.sh b/benchmarks/nel/src/datasets/download_mewsli-9.sh similarity index 100% rename from benchmarks/nel/scripts/datasets/download_mewsli-9.sh rename to benchmarks/nel/src/datasets/download_mewsli-9.sh diff --git a/benchmarks/nel/scripts/datasets/evaluation.py b/benchmarks/nel/src/datasets/evaluation.py similarity index 100% rename from benchmarks/nel/scripts/datasets/evaluation.py rename to benchmarks/nel/src/datasets/evaluation.py diff --git a/benchmarks/nel/src/datasets/mewsli_9.py b/benchmarks/nel/src/datasets/mewsli_9.py new file mode 100644 index 000000000..faa43700d --- /dev/null +++ b/benchmarks/nel/src/datasets/mewsli_9.py @@ -0,0 +1,155 @@ +""" Dataset class for Mewsli-9 dataset. """ +import copy +import csv +import distutils.dir_util +import pickle +import time +from typing import Tuple, Set, List, Dict, Optional + +import spacy +import tqdm +from spacy import Language +from spacy.tokens import Doc + +from datasets.dataset import Dataset +from datasets.utils import fetch_entity_information, create_spans_from_doc_annotation +from wikid import schemas, load_entities + + +class Mewsli9Dataset(Dataset): + """Mewsli-9 dataset.""" + + @property + def name(self) -> str: + return "mewsli_9" + + def _extract_annotations_from_corpus( + self, **kwargs + ) -> Dict[str, List[schemas.Annotation]]: + annotations: Dict[str, List[schemas.Annotation]] = {} + + with open( + self._paths["assets"] / "clean" / "en" / "mentions.tsv", encoding="utf-8" + ) as file_path: + curr_article: Optional[str] = None + curr_docid: Optional[str] = None + + for i, row in enumerate(csv.DictReader(file_path, delimiter="\t")): + assert len(row) == 9 + if row["docid"] not in annotations: + annotations[row["docid"]] = [] + + # Read article, if this annotation refers to a new entity. + if row["docid"] != curr_docid: + curr_docid = row["docid"] + curr_article = self._read_article_file(curr_docid) + + # Correct leading/trailing whitespaces. 
+ annot_start = int(row["position"]) + annot_end = annot_start + int(row["length"]) + while curr_article[annot_start] == " ": + annot_start += 1 + while curr_article[annot_end - 1] == " ": + annot_end -= 1 + annot_text = curr_article[annot_start:annot_end] + assert annot_text.startswith(" ") is False and annot_text.endswith(" ") is False + + annotations[row["docid"]].append( + schemas.Annotation( + entity_name=row["url"].split("/")[-1].replace("_", " "), + entity_id=row["qid"], + start_pos=annot_start, + end_pos=annot_end, + ) + ) + + return annotations + + def clean_assets(self) -> None: + # No cleaning necessary, just copy all data into /clean. + distutils.dir_util.copy_tree(str(self._paths["assets"] / "raw"), str(self._paths["assets"] / "clean")) + + def _read_article_file(self, doc_id: str) -> str: + """Reads article file for specified doc ID. + doc_id (str): Doc ID of article to read. + RETURNS (str): Article text as single string. + """ + with open( + self._paths["assets"] / "clean" / "en" / "text" / doc_id, + encoding="utf-8", + ) as text_file: + # Replace newlines with whitespace and \xa0 (non-breaking whitespace) appearing after titles + # with a period. This maintains the correct offsets in the dataset annotations. + return "".join([ + line.replace("\n", " ").replace("\xa0", ".") for line in text_file.readlines() + ]) + + def _create_annotated_docs(self, nlp: Language, filter_terms: Optional[Set[str]] = None) -> List[Doc]: + annotated_docs: List[Doc] = [] + + with open( + self._paths["assets"] / "clean" / "en" / "docs.tsv", encoding="utf-8" + ) as title_file: + row_count = sum(1 for _ in title_file) + title_file.seek(0) + n_annots_available = 0 + n_annots_assigned = 0 + + # Load entities batched to avoid hitting max. number of parameters supported by SQLite. + batch_size = 2**14 + qids = tuple({annot.entity_id for annots in self._annotations.values() for annot in annots}) + entities = { + qid: entity_info + for entity_batch in + [ + load_entities(qids=qids[i:i + batch_size], language=self._language) + for i in range(0, len(qids), batch_size) + ] + for qid, entity_info in entity_batch.items() + } + + with tqdm.tqdm( + desc="Reading files", total=row_count, leave=False + ) as pbar: + docs_info_rows: List[Dict[str, str]] = [] + doc_texts: List[str] = [] + for row in csv.DictReader(title_file, delimiter="\t"): + doc_text = self._read_article_file(row["docid"]) + if filter_terms and not any([ft in doc_text for ft in filter_terms]): + pbar.update(1) + continue + docs_info_rows.append(row) + doc_texts.append(doc_text) + pbar.update(1) + + docs = list( + nlp.pipe( + tqdm.tqdm( + doc_texts, + desc="Creating doc objects", + leave=False + ), + n_process=-1, + batch_size=64, + ) + ) + + # This is an embarrassingly parallel scenario - speed is fine for ~10k articles though. 
+ with tqdm.tqdm( + desc="Extracting annotations", total=len(docs), leave=False + ) as pbar: + for doc, row in zip(docs, docs_info_rows): + doc_annots = self._annotations.get(row["docid"], []) + doc.ents, _ = create_spans_from_doc_annotation( + doc=doc, entities_info=entities, annotations=doc_annots, + ) + annotated_docs.append(doc) + n_annots_available += len(doc_annots) + n_annots_assigned += len(doc.ents) + + pbar.update(1) + + print(f"Assigned {n_annots_assigned} out of {n_annots_available} annotations " + f"({(n_annots_assigned / n_annots_available * 100):.2f}%) in {pbar.n} docs.") + + return annotated_docs diff --git a/benchmarks/nel/scripts/datasets/utils.py b/benchmarks/nel/src/datasets/utils.py similarity index 100% rename from benchmarks/nel/scripts/datasets/utils.py rename to benchmarks/nel/src/datasets/utils.py diff --git a/benchmarks/nel/scripts/parse_corpus.py b/benchmarks/nel/src/parse_corpus.py similarity index 100% rename from benchmarks/nel/scripts/parse_corpus.py rename to benchmarks/nel/src/parse_corpus.py diff --git a/benchmarks/nel/scripts/train.py b/benchmarks/nel/src/train.py similarity index 60% rename from benchmarks/nel/scripts/train.py rename to benchmarks/nel/src/train.py index eed760494..636d25762 100644 --- a/benchmarks/nel/scripts/train.py +++ b/benchmarks/nel/src/train.py @@ -13,11 +13,11 @@ use_gpu=0, overrides={ "paths.dataset_name": "mewsli_9", - "paths.train": "corpora/mewsli_9/train.spacy", - "paths.dev": "corpora/mewsli_9/dev.spacy", - "paths.kb": "wikid/output/en/kb", - "paths.db": "wikid/output/en/wiki.sqlite3", - "paths.base_nlp": "training/base-nlp/en", + "paths.train": str(root / "corpora/mewsli_9/train.spacy"), + "paths.dev": str(root / "corpora/mewsli_9/dev.spacy"), + "paths.kb": str(root / "wikid/output/en/kb"), + "paths.db": str(root / "wikid/output/en/wiki.sqlite3"), + "paths.base_nlp": str(root / "training/base-nlp/en"), "paths.language": "en", "training.max_steps": 10, } diff --git a/benchmarks/nel/scripts/train.sh b/benchmarks/nel/src/train.sh similarity index 86% rename from benchmarks/nel/scripts/train.sh rename to benchmarks/nel/src/train.sh index 9b6a31744..c7c5aea38 100644 --- a/benchmarks/nel/scripts/train.sh +++ b/benchmarks/nel/src/train.sh @@ -9,7 +9,7 @@ gpu_id="${6:--1}" # (4) config file name, # (5) max. steps. # (6) GPU information if GPU is to be used. -PYTHONPATH='scripts' python -m spacy train configs/$4 \ +PYTHONPATH='src' python -m spacy train configs/$4 \ --paths.dataset_name $1 \ --output training/$1/$2 \ --paths.train corpora/$1/train.spacy \ @@ -19,5 +19,5 @@ PYTHONPATH='scripts' python -m spacy train configs/$4 \ --paths.base_nlp training/base-nlp/$3 \ --paths.language $3 \ --training.max_steps $5 \ - -c scripts/custom_functions.py \ + -c src/custom_functions.py \ --gpu-id $gpu_id \ No newline at end of file diff --git a/benchmarks/nel/scripts/utils.py b/benchmarks/nel/src/utils.py similarity index 100% rename from benchmarks/nel/scripts/utils.py rename to benchmarks/nel/src/utils.py From c823add408a7d934d13bf456f6464ce231d278c3 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 6 Dec 2022 16:40:40 +0100 Subject: [PATCH 35/38] Fix NEL training. 
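
This patch precomputes entity candidates per mention and persists them next to the corpora, so training and evaluation can reuse them instead of generating candidates on the fly. As a minimal sketch (not guaranteed to match WikiKB's internal lookup logic), the persisted mapping of mention surface form to WikiKBCandidate objects can be inspected as follows; the dataset path and the five-item slice are placeholders, and unpickling assumes the wikid package is importable so the candidate class can be reconstructed:

    # Sketch: inspect the mapping written by Dataset.retrieve_candidates_for_mentions().
    import pickle
    from pathlib import Path

    path = Path("corpora/mewsli_9/mentions_candidates.pkl")  # dataset name is an example
    with open(path, "rb") as file:
        mention_candidates = pickle.load(file)  # Dict[str, List[WikiKBCandidate]]

    # Each candidate list is keyed by the exact mention text it was generated for.
    for mention, candidates in list(mention_candidates.items())[:5]:
        assert all(candidate.mention == mention for candidate in candidates)
        print(f"{mention!r}: {len(candidates)} candidate(s)")
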
--- benchmarks/nel/project.yml | 24 ++++++------- benchmarks/nel/src/cli_compile_corpora.py | 3 +- benchmarks/nel/src/cli_evaluate.py | 4 +-- benchmarks/nel/src/custom_functions.py | 26 +++++++------- benchmarks/nel/src/datasets/dataset.py | 44 ++++++++++++++++++++--- benchmarks/nel/src/train.py | 1 + benchmarks/nel/src/train.sh | 1 + benchmarks/nel/test_nel_benchmark.py | 1 + 8 files changed, 72 insertions(+), 32 deletions(-) diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index 1b66d1551..caa0c9c86 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -97,18 +97,6 @@ commands: outputs: - "assets/${vars.dataset}/annotations.pkl" - - name: retrieve_mentions_candidates - help: "Retrieve candidates for mentions in corpus and persist them in file." - script: - - "env PYTHONPATH=. python ./src/cli_retrieve_mentions_candidates.py ${vars.dataset} ${vars.language}" - deps: - - "wikid/output/${vars.language}/kb" - - "wikid/output/${vars.language}/wiki.annoy" - - "wikid/output/${vars.language}/wiki.sqlite3" - - "assets/${vars.dataset}/annotations.pkl" - outputs: - - "corpora/${vars.dataset}/mention_candidates.pkl" - - name: compile_corpora help: "Compile corpora, separated in train/dev/test sets." script: @@ -122,6 +110,18 @@ commands: - "corpora/${vars.dataset}/dev.spacy" - "corpora/${vars.dataset}/test.spacy" + - name: retrieve_mentions_candidates + help: "Retrieve candidates for mentions in corpus and persist them in file. This is an optional step, but speeds up training and evaluation" + script: + - "env PYTHONPATH=. python ./src/cli_retrieve_mentions_candidates.py ${vars.dataset} ${vars.language}" + deps: + - "wikid/output/${vars.language}/kb" + - "wikid/output/${vars.language}/wiki.annoy" + - "wikid/output/${vars.language}/wiki.sqlite3" + - "assets/${vars.dataset}/annotations.pkl" + outputs: + - "corpora/${vars.dataset}/mention_candidates.pkl" + - name: train help: "Train a new Entity Linking component. Pass --vars.gpu_id GPU_ID to train with GPU. Training with some datasets may take a long time!" 
script: diff --git a/benchmarks/nel/src/cli_compile_corpora.py b/benchmarks/nel/src/cli_compile_corpora.py index 14a0559dc..4b682ee8b 100644 --- a/benchmarks/nel/src/cli_compile_corpora.py +++ b/benchmarks/nel/src/cli_compile_corpora.py @@ -27,5 +27,4 @@ def main(dataset_name: str, language: str, model: str, use_filter_terms: bool = if __name__ == "__main__": - main("mewsli_9", "en", "en_core_web_lg", False) - # typer.run(main) + typer.run(main) diff --git a/benchmarks/nel/src/cli_evaluate.py b/benchmarks/nel/src/cli_evaluate.py index 475b43850..411c269c7 100644 --- a/benchmarks/nel/src/cli_evaluate.py +++ b/benchmarks/nel/src/cli_evaluate.py @@ -17,5 +17,5 @@ def main(dataset_name: str, run_name: str, language: str, gpu_id: Optional[int] if __name__ == "__main__": - main("mewsli_9", "default", "en", 0) - # typer.run(main) + # main("mewsli_9", "default", "en", 0) + typer.run(main) diff --git a/benchmarks/nel/src/custom_functions.py b/benchmarks/nel/src/custom_functions.py index e15cda1da..e0bc2993e 100644 --- a/benchmarks/nel/src/custom_functions.py +++ b/benchmarks/nel/src/custom_functions.py @@ -66,18 +66,19 @@ def read_docbin(_: Language) -> Iterable[Example]: sents = list(example.predicted.sents) sents_orig = list(example.reference.sents) - if len(sents) != len(sents_orig): - for i in range(max(len(sents), len(sents_orig))): - if i < len(sents): - print(sents[i]) - else: - print("out") - if i < len(sents_orig): - print(sents_orig[i]) - else: - print("out") - print("-----") - x = 3 + # if len(sents) != len(sents_orig): + # for i in range(max(len(sents), len(sents_orig))): + # if i < len(sents): + # print(sents[i]) + # else: + # print("out") + # if i < len(sents_orig): + # print(sents_orig[i]) + # else: + # print("out") + # print("-----") + # x = 3 + assert len(sents) == len(sents_orig) assert len(sents) > 0 and len(sents_orig) > 0 assert all([ent is not None for ent in example.predicted.ents]) @@ -93,6 +94,7 @@ def read_docbin(_: Language) -> Iterable[Example]: def load_kb(kb_path: Path) -> Callable[[Vocab], WikiKB]: """Loads WikiKB instance from disk. kb_path (Path): Path to WikiKB path. + mention_candidates_path (Path): Path to pre-computed file with candidates per mention. RETURNS (Callable[[Vocab], WikiKB]): Callable generating WikiKB from disk. """ def kb_from_file(_: Vocab) -> WikiKB: diff --git a/benchmarks/nel/src/datasets/dataset.py b/benchmarks/nel/src/datasets/dataset.py index 3fa0483be..2c155b57a 100644 --- a/benchmarks/nel/src/datasets/dataset.py +++ b/benchmarks/nel/src/datasets/dataset.py @@ -10,7 +10,7 @@ import pickle from collections import defaultdict from pathlib import Path -from typing import Tuple, Set, List, Optional, TypeVar, Type, Dict, Union +from typing import Tuple, Set, List, Optional, TypeVar, Type, Dict import numpy import prettytable @@ -24,7 +24,7 @@ from spacy.pipeline import EntityLinker from wikid import schemas -from wikid.src.kb import WikiKB +from wikid.src.kb import WikiKB, WikiKBCandidate from . 
import evaluation from utils import get_logger @@ -74,7 +74,8 @@ def assemble_paths(dataset_name: str, run_name: str, language: str) -> Dict[str, "annotations": assets_path / "annotations.pkl", "nlp_base": root_path / "training" / "base-nlp" / language, "nlp_best": root_path / "training" / dataset_name / run_name / "model-best", - "corpora": root_path / "corpora" / dataset_name + "corpora": root_path / "corpora" / dataset_name, + "mentions_candidates": root_path / "corpora" / dataset_name / "mentions_candidates.pkl", } @property @@ -410,4 +411,39 @@ def _collapse_spaces(self, doc_id: str, doc_text: str) -> Tuple[str, List[schema def retrieve_candidates_for_mentions(self) -> None: """Retrieves candidates for all mentions in corpus.""" - # todo load KB, read from annotations pickle, persist results \ No newline at end of file + logger.info(f"Retrieving candidates for all mentions in corpus.") + + self._kb = WikiKB.generate_from_disk(self._paths["kb"]) + # This is done to ensure KB is not using outdated mention_candidates map (which happens if the current step was + # already done and is now repeated). + self._kb._mentions_candidates = None + + # Our entity corpora incorporate annotated mentions as in their .ents attributes at this point, so we can + # extract all mentions from there. + mentions = { + ent.text: ent + for corpus_name in ("train", "dev", "test") + for doc in DocBin( + ).from_disk(self._paths["corpora"] / (corpus_name + ".spacy")).get_docs(self._kb.vocab) + for ent in doc.ents + } + mention_texts = list(mentions.keys()) + + # Select candidates. + mention_candidates: Dict[str, List[WikiKBCandidate]] = { + mention_text: mention_candidates + for mention_text, mention_candidates in zip( + mention_texts, + list(self._kb.get_candidates_all([[mentions[mt] for mt in mention_texts]]))[0] + ) + } + for mention in mention_candidates: + assert all([mention == mc.mention for mc in mention_candidates[mention]]) + + # Store results. + self._paths["mentions_candidates"].parent.mkdir(parents=True, exist_ok=True) + with open(self._paths["mentions_candidates"], "wb") as file: + pickle.dump(mention_candidates, file) + # Update hash in KB, persist updated KB. 
+ self._kb.update_path("mentions_candidates", self._paths["mentions_candidates"]) + self._kb.to_disk(self._paths["kb"]) diff --git a/benchmarks/nel/src/train.py b/benchmarks/nel/src/train.py index 636d25762..7dc20b2f0 100644 --- a/benchmarks/nel/src/train.py +++ b/benchmarks/nel/src/train.py @@ -18,6 +18,7 @@ "paths.kb": str(root / "wikid/output/en/kb"), "paths.db": str(root / "wikid/output/en/wiki.sqlite3"), "paths.base_nlp": str(root / "training/base-nlp/en"), + "paths.mentions_candidates": str(root / "corpora" / "mewsli_9" / "mentions_candidates.pkl"), "paths.language": "en", "training.max_steps": 10, } diff --git a/benchmarks/nel/src/train.sh b/benchmarks/nel/src/train.sh index c7c5aea38..c0c3ff0db 100644 --- a/benchmarks/nel/src/train.sh +++ b/benchmarks/nel/src/train.sh @@ -17,6 +17,7 @@ PYTHONPATH='src' python -m spacy train configs/$4 \ --paths.kb wikid/output/$3/kb \ --paths.db wikid/output/$3/wiki.sqlite3 \ --paths.base_nlp training/base-nlp/$3 \ + --paths.mentions_candidates corpora/$1/mentions_candidates.pkl \ --paths.language $3 \ --training.max_steps $5 \ -c src/custom_functions.py \ diff --git a/benchmarks/nel/test_nel_benchmark.py b/benchmarks/nel/test_nel_benchmark.py index d6aca8e47..55be2f3cf 100644 --- a/benchmarks/nel/test_nel_benchmark.py +++ b/benchmarks/nel/test_nel_benchmark.py @@ -26,6 +26,7 @@ def test_nel_benchmark(): os.environ[overrides_key] = overrides project_run(root, "extract_annotations", capture=True) project_run(root, "compile_corpora", capture=True) + project_run(root, "retrieve_mentions_candidates", capture=True) project_run(root, "train", capture=True, overrides={"vars.training_max_steps": 1, "vars.training_max_epochs": 1}) project_run(root, "evaluate", capture=True) project_run(root, "compare_evaluations", capture=True) From ff6a3d3f6a312530a9e491df5c71836b2a20d2e3 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 7 Dec 2022 13:15:21 +0100 Subject: [PATCH 36/38] Update comments and config. --- benchmarks/nel/configs/nel.cfg | 6 +++--- benchmarks/nel/project.yml | 2 +- benchmarks/nel/src/datasets/dataset.py | 9 ++++----- benchmarks/nel/src/datasets/mewsli_9.py | 2 +- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index a9b806389..21d9ba355 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -90,11 +90,11 @@ dev_corpus = corpora.dev seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} dropout = 0.2 -patience = 10000 +patience = 1000000 eval_frequency = 200 accumulate_gradient = 2 -max_epochs = 0 -max_steps = 500 +max_epochs = 1000 +max_steps = 1000 annotating_components = [] frozen_components = [] before_to_disk = null diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index caa0c9c86..c2e94b227 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -10,7 +10,7 @@ vars: gpu_id: "" download_all_wiki_assets: "" # "--extra" to download full Wiki dumps. filter: "True" # Whether to only use parts of Wiki data and corpus containing filter terms. 
- training_max_steps: 10 + training_max_steps: 1000 eval_highlight_metric: "F" # one of ("F", "r", "p") directories: ["assets", "training", "configs", "src", "corpora", "evaluation"] diff --git a/benchmarks/nel/src/datasets/dataset.py b/benchmarks/nel/src/datasets/dataset.py index 2c155b57a..dfc93a72c 100644 --- a/benchmarks/nel/src/datasets/dataset.py +++ b/benchmarks/nel/src/datasets/dataset.py @@ -92,12 +92,11 @@ def compile_corpora(self, model: str, filter_terms: Optional[Set[str]] = None) - with open(self._paths["annotations"], "rb") as file: self._annotations = pickle.load(file) Doc.set_extension("overlapping_annotations", default=None) - nlp_components = ["tok2vec", "senter", "tagger", "attribute_ruler"] - nlp = spacy.load(model) # , enable=nlp_components, config={"nlp.disabled": []} + nlp = spacy.load(model) # Incorporate annotations from corpus into documents. Only keep docs with entities (relevant mostly when working # with filtered data). - self._annotated_docs = [doc for doc in self._create_annotated_docs(nlp, filter_terms) if len(doc.ents)][:500] + self._annotated_docs = [doc for doc in self._create_annotated_docs(nlp, filter_terms) if len(doc.ents)] # Serialize pipeline and corpora. self._paths["nlp_base"].parent.mkdir(parents=True, exist_ok=True) @@ -414,8 +413,8 @@ def retrieve_candidates_for_mentions(self) -> None: logger.info(f"Retrieving candidates for all mentions in corpus.") self._kb = WikiKB.generate_from_disk(self._paths["kb"]) - # This is done to ensure KB is not using outdated mention_candidates map (which happens if the current step was - # already done and is now repeated). + # Ensure KB is not using outdated mention_candidates map (which happens if the current step was already done and + # is now repeated). self._kb._mentions_candidates = None # Our entity corpora incorporate annotated mentions as in their .ents attributes at this point, so we can diff --git a/benchmarks/nel/src/datasets/mewsli_9.py b/benchmarks/nel/src/datasets/mewsli_9.py index faa43700d..0b1794773 100644 --- a/benchmarks/nel/src/datasets/mewsli_9.py +++ b/benchmarks/nel/src/datasets/mewsli_9.py @@ -136,7 +136,7 @@ def _create_annotated_docs(self, nlp: Language, filter_terms: Optional[Set[str]] # This is an embarrassingly parallel scenario - speed is fine for ~10k articles though. with tqdm.tqdm( - desc="Extracting annotations", total=len(docs), leave=False + desc="Parsing annotations", total=len(docs), leave=False ) as pbar: for doc, row in zip(docs, docs_info_rows): doc_annots = self._annotations.get(row["docid"], []) From db2504254f4814a542ceacdcf94221f076cc878d Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 7 Dec 2022 16:10:47 +0100 Subject: [PATCH 37/38] Clean up and adjust config. 
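
With the old reader removed, the config references "EntityEnrichedCorpusReader.v1" for both the train and dev corpora. As a quick sanity check (a sketch, not part of the training code), the name can be resolved through spaCy's registry once src/custom_functions.py has been imported, e.g. with PYTHONPATH=src as in train.sh:

    # Sketch: confirm the renamed corpus reader is registered under the name used in nel.cfg.
    import spacy
    import custom_functions  # noqa: F401, registers the reader on import (requires PYTHONPATH=src)

    reader_factory = spacy.registry.readers.get("EntityEnrichedCorpusReader.v1")
    print(reader_factory.__name__)  # create_docbin_reader
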
--- benchmarks/nel/configs/nel.cfg | 10 +++--- benchmarks/nel/project.yml | 2 +- benchmarks/nel/src/custom_functions.py | 47 -------------------------- 3 files changed, 6 insertions(+), 53 deletions(-) diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index 21d9ba355..60028b0d6 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -74,12 +74,12 @@ kb_path = ${paths.kb} [corpora] [corpora.train] -@readers = "EntityEnrichedCorpusReader.v2" +@readers = "EntityEnrichedCorpusReader.v1" path = ${paths.train} path_nlp_base = ${paths.vectors} [corpora.dev] -@readers = "EntityEnrichedCorpusReader.v2" +@readers = "EntityEnrichedCorpusReader.v1" path = ${paths.dev} path_nlp_base = ${paths.vectors} @@ -90,11 +90,11 @@ dev_corpus = corpora.dev seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} dropout = 0.2 -patience = 1000000 +patience = 10000 eval_frequency = 200 accumulate_gradient = 2 -max_epochs = 1000 -max_steps = 1000 +max_epochs = 25 +max_steps = 10000 annotating_components = [] frozen_components = [] before_to_disk = null diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml index c2e94b227..de4bc0edb 100644 --- a/benchmarks/nel/project.yml +++ b/benchmarks/nel/project.yml @@ -10,7 +10,7 @@ vars: gpu_id: "" download_all_wiki_assets: "" # "--extra" to download full Wiki dumps. filter: "True" # Whether to only use parts of Wiki data and corpus containing filter terms. - training_max_steps: 1000 + training_max_steps: 10000 eval_highlight_metric: "F" # one of ("F", "r", "p") directories: ["assets", "training", "configs", "src", "corpora", "evaluation"] diff --git a/benchmarks/nel/src/custom_functions.py b/benchmarks/nel/src/custom_functions.py index e0bc2993e..1b999972b 100644 --- a/benchmarks/nel/src/custom_functions.py +++ b/benchmarks/nel/src/custom_functions.py @@ -3,7 +3,6 @@ import spacy from spacy import registry, Vocab, Language -from spacy.tokens import DocBin from spacy.training import Example from spacy.pipeline import EntityLinker @@ -11,39 +10,6 @@ @spacy.registry.readers("EntityEnrichedCorpusReader.v1") -def create_docbin_reader(path: Path, path_nlp_base: Path) -> Callable[[Language], Iterable[Example]]: - """Returns Callable generating a corpus reader function that enriches read documents with the correct entities as - specified in the corpus annotations. - path (Path): Path to DocBin file with documents to prepare. - path_nlp_base (Path): Path to pipeline for tokenization/sentence. - """ - def read_docbin(_: Language) -> Iterable[Example]: - """Read DocBin for training. Set all entities as they appear in the annotated corpus, but set entity type to - NIL. - nlp (Language): Pipeline to use for creating document used in EL from reference document. 
- """ - nlp = spacy.load(path_nlp_base, enable=["senter"], config={"nlp.disabled": []}) - - for doc in DocBin().from_disk(path).get_docs(nlp.vocab): - pred_doc = nlp(doc.text) - pred_doc.ents = [ - pred_doc.char_span(ent.start_char, ent.end_char, label=EntityLinker.NIL, kb_id=EntityLinker.NIL) - for ent in doc.ents - ] - sents = list(pred_doc.sents) - sents_orig = list(doc.sents) - assert len(sents) == len(sents_orig) - assert len(sents) > 0 and len(sents_orig) > 0 - assert all([ent is not None for ent in pred_doc.ents]) - assert len(doc.ents) == len(pred_doc.ents) - assert len(doc.ents) > 0 - - yield Example(pred_doc, doc) - - return read_docbin - - -@spacy.registry.readers("EntityEnrichedCorpusReader.v2") def create_docbin_reader(path: Path, path_nlp_base: Path) -> Callable[[Language], Iterable[Example]]: """Returns Callable generating a corpus reader function that enriches read documents with the correct entities as specified in the corpus annotations. @@ -66,19 +32,6 @@ def read_docbin(_: Language) -> Iterable[Example]: sents = list(example.predicted.sents) sents_orig = list(example.reference.sents) - # if len(sents) != len(sents_orig): - # for i in range(max(len(sents), len(sents_orig))): - # if i < len(sents): - # print(sents[i]) - # else: - # print("out") - # if i < len(sents_orig): - # print(sents_orig[i]) - # else: - # print("out") - # print("-----") - # x = 3 - assert len(sents) == len(sents_orig) assert len(sents) > 0 and len(sents_orig) > 0 assert all([ent is not None for ent in example.predicted.ents]) From 946b5ee531d270638a2115a8239d7e3cd4714a88 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 19 Jan 2023 15:35:57 +0100 Subject: [PATCH 38/38] Remove unused parameter 'db' from nel.cfg. --- benchmarks/nel/configs/nel.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg index 60028b0d6..d2cdc56c7 100644 --- a/benchmarks/nel/configs/nel.cfg +++ b/benchmarks/nel/configs/nel.cfg @@ -6,7 +6,6 @@ dev = "" raw = null init_tok2vec = null kb = "" -db = "" base_nlp = "" vectors = "${paths.base_nlp}"
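
After this series, test_nel_benchmark.py exercises the project steps in the order below, including the new retrieve_mentions_candidates step. A rough sketch for replaying them programmatically, assuming spaCy 3.x (where project_run lives under spacy.cli.project.run); the earlier wikid/asset preparation steps defined in project.yml are omitted here:

    # Sketch: run the benchmark steps in the order used by test_nel_benchmark.py.
    from pathlib import Path
    from spacy.cli.project.run import project_run  # spaCy 3.x location (assumption)

    root = Path("benchmarks/nel")
    for step in (
        "extract_annotations",
        "compile_corpora",
        "retrieve_mentions_candidates",  # optional, but speeds up training and evaluation
        "train",
        "evaluate",
        "compare_evaluations",
    ):
        project_run(root, step, capture=True)
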