-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatch_finder.py
67 lines (53 loc) · 2.48 KB
/
match_finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd
from nltk.util import ngrams
from pathlib import Path
import fuzzyset
from datetime import datetime
def get_ngrams(text: str, n: int) -> list:
    """
    Split *text* on whitespace and return all contiguous n-grams,
    each joined back into one space-separated string.
    :rtype: list of strings
    """
    tokens = text.split()
    # Slide a window of width n across the tokens: zip over n shifted
    # views stops at the shortest slice, yielding len(tokens) - n + 1 grams.
    windows = zip(*(tokens[offset:] for offset in range(n)))
    return [' '.join(window) for window in windows]
def get_ngram_of_text(text: str, min_n: int, max_n: int = None, path_file: bool = False) -> list:
    """
    This function receives text (or a text file path) and a range (by min & max values) of n's
    and returns a list with all n-grams for any n in [min_n, max_n].

    :param text: the input text or the path file
    :param path_file: True if text is path of text file
    :param min_n: the shortest n-grams
    :param max_n: the longest ngrams, if only one length needed - keep on None
    :return: list of n-grams from text in all lengths.
    """
    if path_file:
        with open(text, "r", encoding="utf-8") as f:
            text = f.read()
    # Bug fix: the docstring promised max_n=None meant "one length only",
    # but range(min_n, None + 1) raised TypeError. Default it to min_n.
    if max_n is None:
        max_n = min_n
    list_of_ngrams = [get_ngrams(text, n) for n in range(min_n, max_n + 1)]
    return [item for ngrams in list_of_ngrams for item in ngrams]
def get_list_of_text_files_in_dir(path: str) -> list:
    """
    Return the stem (file name without the '.txt' suffix) of every text
    file directly inside *path*, after printing how many were found.
    """
    found = list(Path(path).glob('*.txt'))
    print(f"There are {len(found)} text files in the corpus.")
    return [text_file.stem for text_file in found]
def init_fuzzy_set_on_ngrams(text_file_path: str, min_n: int, max_n: int) -> fuzzyset.FuzzySet:
    """
    Build a FuzzySet seeded with every n-gram (lengths min_n..max_n) of
    the given text file; prints the path and the elapsed build time.
    """
    started_at = datetime.now()
    print(text_file_path)
    result_set = fuzzyset.FuzzySet()
    # Index each n-gram so later fuzzy lookups can match against it.
    for gram in get_ngram_of_text(text_file_path, min_n, max_n, path_file=True):
        result_set.add(gram)
    print(datetime.now() - started_at)
    return result_set
def search_in_fuzzy_set(fuzzy_set: fuzzyset.FuzzySet, phrase: str) -> list:
    """
    Delegate a fuzzy lookup of *phrase* to the set and return whatever
    FuzzySet.get yields (NOTE(review): likely score/string pairs, and
    possibly None on no match — confirm against the fuzzyset version in use).
    """
    matches = fuzzy_set.get(phrase)
    return matches
def search_in_fuzzy_set_for_df(fuzzy_set: fuzzyset.FuzzySet, transcription_df: pd.DataFrame) -> pd.DataFrame:
    """
    This function receives a fuzzy set and a transcription dataframe
    and returns a new data frame with optional matches (fragment_id,
    transcription, clean_trans, image_URL, match, context,
    match_source_title, match_grade).
    """
    # TODO: not yet implemented — currently a placeholder.
    pass
def quick_fuzzy_set():
    """Placeholder for a faster, at-scale fuzzy-matching approach (see TODO link)."""
    # TODO: https://towardsdatascience.com/fuzzy-matching-at-scale-84f2bfd0c536
    pass