Skip to content

Commit

Permalink
Merge pull request #7 from fastdatascience/allow_custom_vocab
Browse files Browse the repository at this point in the history
Allow custom vocab
  • Loading branch information
woodthom2 authored Sep 6, 2024
2 parents 901a8dd + d36aefa commit 06c02ce
Show file tree
Hide file tree
Showing 5 changed files with 980 additions and 39 deletions.
4 changes: 2 additions & 2 deletions src/drug_named_entity_recognition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@

__version__ = "2.0.0"


from drug_named_entity_recognition.drugs_finder import find_drugs
from drug_named_entity_recognition.drugs_finder import find_drugs, add_custom_drug_synonym, add_custom_new_drug, \
reset_drugs_data, remove_drug_synonym
135 changes: 98 additions & 37 deletions src/drug_named_entity_recognition/drugs_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,26 +34,20 @@
from collections import Counter

from drug_named_entity_recognition.structure_file_downloader import download_structures
from drug_named_entity_recognition.util import stopwords

dbid_to_mol_lookup = {}

this_path = pathlib.Path(__file__).parent.resolve()

# Load dictionary from disk

with bz2.open(this_path.joinpath("drug_ner_dictionary.pkl.bz2"), "rb") as f:
d = pkl.load(f)

drug_variant_to_canonical = d["drug_variant_to_canonical"]
drug_canonical_to_data = d["drug_canonical_to_data"]
drug_variant_to_variant_data = d["drug_variant_to_variant_data"]

for variant, canonicals in drug_variant_to_canonical.items():
for canonical in canonicals:
if canonical in drug_canonical_to_data:
if "synonyms" not in drug_canonical_to_data[canonical]:
drug_canonical_to_data[canonical]["synonyms"] = []
drug_canonical_to_data[canonical]["synonyms"].append(variant)
drug_variant_to_canonical = {}
drug_canonical_to_data = {}
drug_variant_to_variant_data = {}
ngram_to_variant = {}
variant_to_ngrams = {}


def get_ngrams(text):
Expand All @@ -64,16 +58,72 @@ def get_ngrams(text):
return ngrams


ngram_to_variant = {}
variant_to_ngrams = {}
for drug_variant in drug_variant_to_canonical:
# Load dictionary from disk
def reset_drugs_data():
drug_variant_to_canonical.clear()
drug_canonical_to_data.clear()
drug_variant_to_variant_data.clear()
ngram_to_variant.clear()
variant_to_ngrams.clear()

drug_variant_to_canonical.update(d["drug_variant_to_canonical"])
drug_canonical_to_data.update(d["drug_canonical_to_data"])
drug_variant_to_variant_data.update(d["drug_variant_to_variant_data"])

for variant, canonicals in drug_variant_to_canonical.items():
for canonical in canonicals:
if canonical in drug_canonical_to_data:
if "synonyms" not in drug_canonical_to_data[canonical]:
drug_canonical_to_data[canonical]["synonyms"] = []
drug_canonical_to_data[canonical]["synonyms"].append(variant)

for drug_variant in drug_variant_to_canonical:
ngrams = get_ngrams(drug_variant)
variant_to_ngrams[drug_variant] = ngrams
for ngram in ngrams:
if ngram not in ngram_to_variant:
ngram_to_variant[ngram] = []
ngram_to_variant[ngram].append(drug_variant)


def add_custom_drug_synonym(drug_variant: str, canonical_name: str, optional_variant_data: dict = None):
drug_variant = drug_variant.lower()
canonical_name = canonical_name.lower()
drug_variant_to_canonical[drug_variant] = [canonical_name]
if optional_variant_data is not None and len(optional_variant_data) > 0:
drug_variant_to_variant_data[drug_variant] = optional_variant_data

ngrams = get_ngrams(drug_variant)
variant_to_ngrams[drug_variant] = ngrams
for ngram in ngrams:
if ngram not in ngram_to_variant:
ngram_to_variant[ngram] = []
ngram_to_variant[ngram].append(drug_variant)

return f"Added {drug_variant} as a synonym for {canonical_name}. Optional data attached to this synonym = {optional_variant_data}"


def add_custom_new_drug(drug_name, drug_data):
drug_name = drug_name.lower()
drug_canonical_to_data[drug_name] = drug_data
add_custom_drug_synonym(drug_name, drug_name)

return f"Added {drug_name} to the tool with data {drug_data}"


def remove_drug_synonym(drug_variant: str):
drug_variant = drug_variant.lower()
ngrams = get_ngrams(drug_variant)

del variant_to_ngrams[drug_variant]
del drug_variant_to_canonical[drug_variant]
del drug_variant_to_variant_data[drug_variant]

for ngram in ngrams:
ngram_to_variant[ngram].remove(drug_variant)

return f"Removed {drug_variant} from dictionary"


def get_fuzzy_match(surface_form: str):
query_ngrams = get_ngrams(surface_form)
Expand All @@ -85,16 +135,21 @@ def get_fuzzy_match(surface_form: str):

candidate_to_jaccard = {}
for candidate, num_matching_ngrams in candidate_to_num_matching_ngrams.items():
ngrams_in_query_and_candidate = ngrams.union(variant_to_ngrams[candidate])
ngrams_in_query_and_candidate = query_ngrams.union(variant_to_ngrams[candidate])
jaccard = num_matching_ngrams / len(ngrams_in_query_and_candidate)
candidate_to_jaccard[candidate] = jaccard

query_length = len(surface_form)
if len(candidate_to_num_matching_ngrams) > 0:
top_candidate = max(candidate_to_jaccard, key=candidate_to_jaccard.get)
jaccard = candidate_to_jaccard[top_candidate]
query_ngrams_missing_in_candidate = query_ngrams.difference(variant_to_ngrams[top_candidate])
candidate_ngrams_missing_in_query = variant_to_ngrams[top_candidate].difference(query_ngrams)
if max([len(query_ngrams_missing_in_candidate), len(candidate_ngrams_missing_in_query)]) <= 3:

candidate_length = len(top_candidate)
length_diff = abs(query_length - candidate_length)
if max([len(query_ngrams_missing_in_candidate), len(candidate_ngrams_missing_in_query)]) <= 3 \
and length_diff <= 2:
return top_candidate, jaccard
return None, None

Expand Down Expand Up @@ -135,7 +190,8 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu

# Search for 2 token sequences
for token_idx, token in enumerate(tokens[:-1]):
cand = token + " " + tokens[token_idx + 1]
next_token = tokens[token_idx + 1]
cand = token + " " + next_token
cand_norm = cand.lower()

match = drug_variant_to_canonical.get(cand_norm, None)
Expand All @@ -149,16 +205,17 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
is_exclude.add(token_idx)
is_exclude.add(token_idx + 1)
elif is_fuzzy_match:
fuzzy_matched_variant, similarity = get_fuzzy_match(cand_norm)
if fuzzy_matched_variant is not None:
match = drug_variant_to_canonical[fuzzy_matched_variant]
for m in match:
match_data = dict(drug_canonical_to_data[m]) | drug_variant_to_variant_data.get(
fuzzy_matched_variant, {})
match_data["match_type"] = "fuzzy"
match_data["match_similarity"] = similarity

drug_matches.append((match_data, token_idx, token_idx + 1))
if token.lower() not in stopwords and next_token.lower() not in stopwords:
fuzzy_matched_variant, similarity = get_fuzzy_match(cand_norm)
if fuzzy_matched_variant is not None:
match = drug_variant_to_canonical[fuzzy_matched_variant]
for m in match:
match_data = dict(drug_canonical_to_data[m]) | drug_variant_to_variant_data.get(
fuzzy_matched_variant, {})
match_data["match_type"] = "fuzzy"
match_data["match_similarity"] = similarity

drug_matches.append((match_data, token_idx, token_idx + 1))

for token_idx, token in enumerate(tokens):
if token_idx in is_exclude:
Expand All @@ -170,15 +227,16 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
match_data = dict(drug_canonical_to_data[m]) | drug_variant_to_variant_data.get(cand_norm, {})
drug_matches.append((match_data, token_idx, token_idx))
elif is_fuzzy_match:
fuzzy_matched_variant, similarity = get_fuzzy_match(cand_norm)
if fuzzy_matched_variant is not None:
match = drug_variant_to_canonical[fuzzy_matched_variant]
for m in match:
match_data = dict(drug_canonical_to_data[m]) | drug_variant_to_variant_data.get(
fuzzy_matched_variant, {})
match_data["match_type"] = "fuzzy"
match_data["match_similarity"] = similarity
drug_matches.append((match_data, token_idx, token_idx + 1))
if cand_norm not in stopwords and len(cand_norm) > 3:
fuzzy_matched_variant, similarity = get_fuzzy_match(cand_norm)
if fuzzy_matched_variant is not None:
match = drug_variant_to_canonical[fuzzy_matched_variant]
for m in match:
match_data = dict(drug_canonical_to_data[m]) | drug_variant_to_variant_data.get(
fuzzy_matched_variant, {})
match_data["match_type"] = "fuzzy"
match_data["match_similarity"] = similarity
drug_matches.append((match_data, token_idx, token_idx + 1))

if is_include_structure:
for match in drug_matches:
Expand All @@ -189,3 +247,6 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
match_data["structure_mol"] = structure

return drug_matches


reset_drugs_data()
Loading

0 comments on commit 06c02ce

Please sign in to comment.