diff --git a/harvesting_data_from_source/05_combine_data_sources.py b/harvesting_data_from_source/05_combine_data_sources.py index 04db59f..059e51d 100644 --- a/harvesting_data_from_source/05_combine_data_sources.py +++ b/harvesting_data_from_source/05_combine_data_sources.py @@ -36,7 +36,7 @@ from nltk.corpus import words -from inclusions import common_english_words_to_include_in_drugs_dictionary, extra_terms_to_exclude_from_drugs_dictionary +from inclusions import common_english_words_to_include_in_drugs_dictionary, extra_terms_to_exclude_from_drugs_dictionary, extra_mappings re_num = re.compile(r'^\d+$') re_three_digits = re.compile(r'\d\d\d') @@ -197,6 +197,9 @@ def get_brand_names_nhs(description: str): for synonym in synonyms: add_synonym(synonym, canonical) +for surface_form, canonical_form in extra_mappings.items(): + add_synonym(surface_form, canonical_form) + # Remove common English words print("Finding all drugs that are also in the NLTK list of English words.") diff --git a/harvesting_data_from_source/inclusions.py b/harvesting_data_from_source/inclusions.py index 1749070..215fe20 100644 --- a/harvesting_data_from_source/inclusions.py +++ b/harvesting_data_from_source/inclusions.py @@ -347,3 +347,5 @@ "java tea", "kidney tea", } + +extra_mappings = {"mounjaro": "tirzepatide"} \ No newline at end of file diff --git a/src/drug_named_entity_recognition/drug_ner_dictionary.pkl.bz2 b/src/drug_named_entity_recognition/drug_ner_dictionary.pkl.bz2 index a6731f3..45cde81 100644 Binary files a/src/drug_named_entity_recognition/drug_ner_dictionary.pkl.bz2 and b/src/drug_named_entity_recognition/drug_ner_dictionary.pkl.bz2 differ diff --git a/tests/test_drugs_finder.py b/tests/test_drugs_finder.py index c21643b..0a16581 100644 --- a/tests/test_drugs_finder.py +++ b/tests/test_drugs_finder.py @@ -147,3 +147,8 @@ def test_penicillin_streptomycin(self): drugs = find_drugs("i bought some Penicillin streptomycin".split(" "), is_include_structure=True) self.assertEqual(2, len(drugs)) # should be 1? + + def test_mounjaro(self): + drugs = find_drugs("i bought some Mounjaro".split(" "), is_include_structure=True) + + self.assertEqual(1, len(drugs))