@@ -82,10 +82,12 @@ def __init__(
8282 "PART" ,
8383 "PRON" ,
8484 ]
85- self .match_rule ["POS" ] = {"NOT_IN" : exclude_pos }
85+ if exclude_pos :
86+ self .match_rule ["POS" ] = {"NOT_IN" : exclude_pos }
8687 if exclude_dep is None :
8788 exclude_dep = []
88- self .match_rule ["DEP" ] = {"NOT_IN" : exclude_dep }
89+ if exclude_dep :
90+ self .match_rule ["DEP" ] = {"NOT_IN" : exclude_dep }
8991 self .json_path = json_path
9092 self .check_validity_path ()
9193 self .include_compound_words = include_compound_words
@@ -102,7 +104,7 @@ def __init__(
102104 self .run ()
103105 self .data_upper = {k .upper (): v for k , v in data .items ()}
104106
105- def run (self ):
107+ def run (self ) -> None :
106108 self .check_validity_path ()
107109 self .determine_topn ()
108110 self .set_gensim_model ()
@@ -119,7 +121,11 @@ def run(self):
119121 if not self .ent_score :
120122 del self .kv
121123
122- def check_validity_path (self ):
124+ def check_validity_path (self ) -> None :
125+ """
126+ If the path is a file, create the parent directory if it doesn't exist. If the path is a directory, create the
127+ directory and set the path to the default file name
128+ """
123129 if self .json_path :
124130 if Path (self .json_path ).suffix :
125131 Path (self .json_path ).parents [0 ].mkdir (parents = True , exist_ok = True )
@@ -132,7 +138,7 @@ def check_validity_path(self):
132138 f" ´json_path´to { self .json_path } "
133139 )
134140
135- def determine_topn (self ):
141+ def determine_topn (self ) -> None :
136142 """
137143 If the user doesn't specify a topn value for each class,
138144 then the topn value for each class is set to 100
@@ -146,7 +152,7 @@ def determine_topn(self):
146152 ), f"Provide a topn integer for each of the { num_classes } classes."
147153 self .topn_dict = dict (zip (self .data , self .topn ))
148154
149- def set_gensim_model (self ):
155+ def set_gensim_model (self ) -> None :
150156 """
151157 If the model_path is not None, then we try to load the model from the path.
152158 If it's not a valid path, then we raise an exception.
@@ -186,7 +192,7 @@ def set_gensim_model(self):
186192
187193 self .kv .add_vectors (wordList , vectorList )
188194
189- def verify_data (self , verbose : bool = True ):
195+ def verify_data (self , verbose : bool = True ) -> None :
190196 """
191197 It takes a dictionary of lists of words, and stores in self.data a dictionary of lists of words,
192198 where each word in the list is present in the word2vec model (nothing is returned)
@@ -222,7 +228,7 @@ def verify_data(self, verbose: bool = True):
222228 raise Exception (msg )
223229 self .data = deepcopy (verified_data )
224230
225- def expand_concepts (self ):
231+ def expand_concepts (self ) -> None :
226232 """
227233 For each key in the data dictionary, find the topn most similar words to the key and the values in the data
228234 dictionary, and add those words to the values in the data dictionary
@@ -244,7 +250,7 @@ def expand_concepts(self):
244250 {self .check_presence_vocab (word ) for word , _ratio in similar }
245251 )
246252
247- def resolve_overlapping_concepts (self ):
253+ def resolve_overlapping_concepts (self ) -> None :
248254 """
249255 It removes words from the data that are in other concepts, and then removes words that are not closest to the
250256 centroid of the concept
@@ -256,7 +262,7 @@ def resolve_overlapping_concepts(self):
256262 if key == self .kv .most_similar_to_given (word , list (self .data .keys ()))
257263 ]
258264
259- def infer_original_data (self ):
265+ def infer_original_data (self ) -> None :
260266 """
261267 It takes the original data and adds the new data to it, then removes the new data from the original data.
262268 """
@@ -272,7 +278,7 @@ def infer_original_data(self):
272278 if word not in self .original_data [key_y ]
273279 ]
274280
275- def lemmatize_concepts (self ):
281+ def lemmatize_concepts (self ) -> None :
276282 """
277283 For each key in the data dictionary,
278284 the function takes the list of concepts associated with that key, and lemmatizes
@@ -283,7 +289,7 @@ def lemmatize_concepts(self):
283289 set ([doc [0 ].lemma_ for doc in self .nlp .pipe (self .data [key ])])
284290 )
285291
286- def create_conceptual_patterns (self ):
292+ def create_conceptual_patterns (self ) -> None :
287293 """
288294 For each key in the data dictionary,
289295 create a pattern for each word in the list of words associated with that key.
@@ -311,7 +317,13 @@ def create_conceptual_patterns(self):
311317 """
312318 patterns = []
313319
314- def add_patterns (input_dict ):
320+ def add_patterns (input_dict : dict ) -> None :
321+ """
322+ It creates a list of dictionaries that can be used for a spaCy entity ruler
323+
324+ :param input_dict: a dictionary
325+ :type input_dict: dict
326+ """
315327 for key in input_dict :
316328 if self .match_key == "LEMMA" :
317329 words = [
@@ -383,7 +395,7 @@ def add_patterns(input_dict):
383395 self .ruler = self .nlp .add_pipe ("entity_ruler" , config = {"overwrite_ents" : True })
384396 self .ruler .add_patterns (patterns )
385397
386- def __call__ (self , doc : Doc ):
398+ def __call__ (self , doc : Doc ) -> Doc :
387399 """
388400 It takes a doc object and assigns a score to each entity in the doc object
389401
@@ -394,7 +406,7 @@ def __call__(self, doc: Doc):
394406 doc = self .assign_score_to_entities (doc )
395407 return doc
396408
397- def pipe (self , stream , batch_size = 128 ):
409+ def pipe (self , stream , batch_size = 128 ) -> Doc :
398410 """
399411 It takes a stream of documents, and for each document,
400412 it assigns a score to each entity in the document
@@ -408,7 +420,7 @@ def pipe(self, stream, batch_size=128):
408420 doc = self .assign_score_to_entities (doc )
409421 yield doc
410422
411- def assign_score_to_entities (self , doc : Doc ):
423+ def assign_score_to_entities (self , doc : Doc ) -> Doc :
412424 """
413425 The function takes a spaCy document as input and assigns a score to each entity in the document. The score is
414426 calculated using the word embeddings of the entity and the concept.
@@ -460,6 +472,14 @@ def assign_score_to_entities(self, doc: Doc):
460472 return doc
461473
462474 def _check_presence_vocab (self , word : str ) -> str :
475+ """
476+ If the word is in the vocabulary, return the word. If not, replace spaces and dashes with the word delimiter and
477+ check if the new word is in the vocabulary. If so, return the new word
478+
479+ :param word: str - the word to check
480+ :type word: str
481+ :return: The word or the check_word
482+ """
463483 if word in self .kv :
464484 return word
465485 for op in [" " , "-" ]:
@@ -468,6 +488,16 @@ def _check_presence_vocab(self, word: str) -> str:
468488 return check_word
469489
470490 def check_presence_vocab (self , word : str ) -> str :
491+ """
492+ If the word is not lowercase and the case_sensitive flag is set to False, then check if the lowercase version of
493+ the word is in the vocabulary. If it is, return the lowercase version of the word. Otherwise, return the word
494+ itself
495+
496+ :param word: The word to check for presence in the vocabulary
497+ :type word: str
498+ :return: The word itself if it is present in the vocabulary, otherwise the word with the highest probability of
499+ being the word that was intended.
500+ """
471501 if not word .islower () and not self .case_sensitive :
472502 present_word = self ._check_presence_vocab (word .lower ())
473503 if present_word :
0 commit comments