
Commit 0e84319

added some fine-tuning to most recent PR by Tom Aarsen
1 parent f0e7db5 · commit 0e84319

2 files changed: 47 additions & 17 deletions

concise_concepts/conceptualizer/Conceptualizer.py
Lines changed: 46 additions & 16 deletions
@@ -82,10 +82,12 @@ def __init__(
                 "PART",
                 "PRON",
             ]
-        self.match_rule["POS"] = {"NOT_IN": exclude_pos}
+        if exclude_pos:
+            self.match_rule["POS"] = {"NOT_IN": exclude_pos}
         if exclude_dep is None:
             exclude_dep = []
-        self.match_rule["DEP"] = {"NOT_IN": exclude_dep}
+        if exclude_dep:
+            self.match_rule["DEP"] = {"NOT_IN": exclude_dep}
         self.json_path = json_path
         self.check_validity_path()
         self.include_compound_words = include_compound_words
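The two new `if` guards mean that an explicitly empty exclusion list no longer produces a vacuous {"NOT_IN": []} token constraint. A minimal standalone sketch of the resulting behavior (the function name and the abbreviated default list are illustrative, not part of the diff):

    def build_match_rule(exclude_pos=None, exclude_dep=None) -> dict:
        # Mirrors the __init__ logic above: None selects the default POS
        # exclusions (abbreviated here); an empty list disables the rule.
        match_rule = {}
        if exclude_pos is None:
            exclude_pos = ["PART", "PRON"]  # abbreviated; full default list above
        if exclude_pos:
            match_rule["POS"] = {"NOT_IN": exclude_pos}
        if exclude_dep is None:
            exclude_dep = []
        if exclude_dep:
            match_rule["DEP"] = {"NOT_IN": exclude_dep}
        return match_rule

    assert build_match_rule(exclude_pos=[]) == {}  # empty list: no constraint at all
    assert build_match_rule(exclude_dep=["amod"])["DEP"] == {"NOT_IN": ["amod"]}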
@@ -102,7 +104,7 @@ def __init__(
         self.run()
         self.data_upper = {k.upper(): v for k, v in data.items()}
 
-    def run(self):
+    def run(self) -> None:
         self.check_validity_path()
         self.determine_topn()
         self.set_gensim_model()
@@ -119,7 +121,11 @@ def run(self):
         if not self.ent_score:
             del self.kv
 
-    def check_validity_path(self):
+    def check_validity_path(self) -> None:
+        """
+        If the path is a file, create the parent directory if it doesn't exist. If the path is a directory, create the
+        directory and set the path to the default file name
+        """
         if self.json_path:
             if Path(self.json_path).suffix:
                 Path(self.json_path).parents[0].mkdir(parents=True, exist_ok=True)
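A standalone sketch of the pathlib branching this new docstring summarizes (the example path is illustrative, and the default-file-name fallback is described by the docstring rather than shown in this hunk):

    from pathlib import Path

    json_path = "output/matched_data.json"  # illustrative value
    if Path(json_path).suffix:
        # Looks like a file: make sure its parent directory exists.
        Path(json_path).parents[0].mkdir(parents=True, exist_ok=True)
    else:
        # Looks like a directory: create it; the class then falls back
        # to a default file name inside it, per the docstring.
        Path(json_path).mkdir(parents=True, exist_ok=True)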
@@ -132,7 +138,7 @@ def check_validity_path(self):
                 f" ´json_path´to {self.json_path}"
             )
 
-    def determine_topn(self):
+    def determine_topn(self) -> None:
         """
         If the user doesn't specify a topn value for each class,
         then the topn value for each class is set to 100
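In practice `determine_topn` reduces to one cutoff per concept class. A small sketch of that mapping, using made-up class names; the default of 100 and the assertion message come from this method's docstring and body:

    data = {"fruit": ["apple", "pear"], "vehicle": ["car"]}
    topn = None  # user did not specify per-class values

    if topn is None:
        topn_dict = {key: 100 for key in data}  # default: 100 per class
    else:
        assert len(topn) == len(data), f"Provide a topn integer for each of the {len(data)} classes."
        topn_dict = dict(zip(data, topn))

    print(topn_dict)  # {'fruit': 100, 'vehicle': 100}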
@@ -146,7 +152,7 @@ def determine_topn(self):
             ), f"Provide a topn integer for each of the {num_classes} classes."
             self.topn_dict = dict(zip(self.data, self.topn))
 
-    def set_gensim_model(self):
+    def set_gensim_model(self) -> None:
         """
         If the model_path is not None, then we try to load the model from the path.
         If it's not a valid path, then we raise an exception.
@@ -186,7 +192,7 @@ def set_gensim_model(self):
 
             self.kv.add_vectors(wordList, vectorList)
 
-    def verify_data(self, verbose: bool = True):
+    def verify_data(self, verbose: bool = True) -> None:
         """
         It takes a dictionary of lists of words, and returns a dictionary of lists of words,
         where each word in the list is present in the word2vec model
@@ -222,7 +228,7 @@ def verify_data(self, verbose: bool = True):
                 raise Exception(msg)
         self.data = deepcopy(verified_data)
 
-    def expand_concepts(self):
+    def expand_concepts(self) -> None:
         """
         For each key in the data dictionary, find the topn most similar words to the key and the values in the data
         dictionary, and add those words to the values in the data dictionary
@@ -244,7 +250,7 @@ def expand_concepts(self):
                 {self.check_presence_vocab(word) for word, _ratio in similar}
             )
 
-    def resolve_overlapping_concepts(self):
+    def resolve_overlapping_concepts(self) -> None:
         """
         It removes words from the data that are in other concepts, and then removes words that are not closest to the
         centroid of the concept
@@ -256,7 +262,7 @@ def resolve_overlapping_concepts(self):
                 if key == self.kv.most_similar_to_given(word, list(self.data.keys()))
             ]
 
-    def infer_original_data(self):
+    def infer_original_data(self) -> None:
         """
         It takes the original data and adds the new data to it, then removes the new data from the original data.
         """
@@ -272,7 +278,7 @@ def infer_original_data(self):
                 if word not in self.original_data[key_y]
             ]
 
-    def lemmatize_concepts(self):
+    def lemmatize_concepts(self) -> None:
         """
         For each key in the data dictionary,
         the function takes the list of concepts associated with that key, and lemmatizes
@@ -283,7 +289,7 @@ def lemmatize_concepts(self):
             set([doc[0].lemma_ for doc in self.nlp.pipe(self.data[key])])
         )
 
-    def create_conceptual_patterns(self):
+    def create_conceptual_patterns(self) -> None:
         """
         For each key in the data dictionary,
         create a pattern for each word in the list of words associated with that key.
@@ -311,7 +317,13 @@ def create_conceptual_patterns(self):
         """
         patterns = []
 
-        def add_patterns(input_dict):
+        def add_patterns(input_dict: dict) -> None:
+            """
+            It creates a list of dictionaries that can be used for a spaCy entity ruler
+
+            :param input_dict: a dictionary
+            :type input_dict: dict
+            """
             for key in input_dict:
                 if self.match_key == "LEMMA":
                     words = [
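For reference, the `patterns` list that `add_patterns` fills holds plain spaCy entity-ruler dicts. A hypothetical single entry combining a LEMMA token match with the now-optional NOT_IN rule (the label and all values are made up):

    pattern = {
        "label": "fruit",
        "pattern": [
            {"LEMMA": "apple", "POS": {"NOT_IN": ["PART", "PRON"]}},
        ],
    }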
@@ -383,7 +395,7 @@ def add_patterns(input_dict):
         self.ruler = self.nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
         self.ruler.add_patterns(patterns)
 
-    def __call__(self, doc: Doc):
+    def __call__(self, doc: Doc) -> Doc:
         """
         It takes a doc object and assigns a score to each entity in the doc object
 
@@ -394,7 +406,7 @@ def __call__(self, doc: Doc):
         doc = self.assign_score_to_entities(doc)
         return doc
 
-    def pipe(self, stream, batch_size=128):
+    def pipe(self, stream, batch_size=128) -> Doc:
         """
         It takes a stream of documents, and for each document,
         it assigns a score to each entity in the document
@@ -408,7 +420,7 @@ def pipe(self, stream, batch_size=128):
             doc = self.assign_score_to_entities(doc)
             yield doc
 
-    def assign_score_to_entities(self, doc: Doc):
+    def assign_score_to_entities(self, doc: Doc) -> Doc:
         """
         The function takes a spaCy document as input and assigns a score to each entity in the document. The score is
         calculated using the word embeddings of the entity and the concept.
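As a rough illustration of the embedding-based scoring this docstring describes (the exact aggregation is not visible in this diff), cosine similarity between an entity word and its concept label is the natural gensim primitive:

    from gensim.models import KeyedVectors

    # Tiny stand-in vocabulary; the vectors are made up for illustration.
    kv = KeyedVectors(vector_size=2)
    kv.add_vectors(["apple", "fruit"], [[0.9, 0.1], [0.8, 0.2]])

    score = kv.similarity("apple", "fruit")  # cosine similarity in [-1, 1]
    print(float(score))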
@@ -460,6 +472,14 @@ def assign_score_to_entities(self, doc: Doc):
         return doc
 
     def _check_presence_vocab(self, word: str) -> str:
+        """
+        If the word is in the vocabulary, return the word. If not, replace spaces and dashes with the word delimiter and
+        check if the new word is in the vocabulary. If so, return the new word
+
+        :param word: str - the word to check
+        :type word: str
+        :return: The word or the check_word
+        """
         if word in self.kv:
             return word
         for op in [" ", "-"]:
@@ -468,6 +488,16 @@ def _check_presence_vocab(self, word: str) -> str:
                 return check_word
 
     def check_presence_vocab(self, word: str) -> str:
+        """
+        If the word is not lowercase and the case_sensitive flag is set to False, then check if the lowercase version of
+        the word is in the vocabulary. If it is, return the lowercase version of the word. Otherwise, return the word
+        itself
+
+        :param word: The word to check for presence in the vocabulary
+        :type word: str
+        :return: The word itself if it is present in the vocabulary, otherwise the word with the highest probability of
+        being the word that was intended.
+        """
         if not word.islower() and not self.case_sensitive:
             present_word = self._check_presence_vocab(word.lower())
             if present_word:
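Taken together, the docstrings on `check_presence_vocab` and `_check_presence_vocab` describe a layered lookup: exact match, then lowercase, then delimiter substitution for spaces and dashes. A runnable sketch against a toy vocabulary (the "_" delimiter and all values are illustrative assumptions):

    from gensim.models import KeyedVectors

    kv = KeyedVectors(vector_size=2)
    kv.add_vectors(["new_york", "apple"], [[0.1, 0.2], [0.3, 0.4]])

    def check_presence_vocab(word, delimiter="_"):
        # Mirrors _check_presence_vocab(): try the word as-is, then with
        # spaces and dashes replaced by the model's word delimiter.
        if word in kv:
            return word
        for op in [" ", "-"]:
            check_word = word.replace(op, delimiter)
            if check_word in kv:
                return check_word
        return None

    print(check_presence_vocab("new york"))  # -> 'new_york'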

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "concise-concepts"
-version = "0.6.3"
+version = "0.7"
 description = "This repository contains an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings. Now with entity confidence scores!"
 authors = ["David Berenstein <david.m.berenstein@gmail.com>"]
 license = "MIT"
