From 839f96ed3dd231f3da410489e6c30f309f81affd Mon Sep 17 00:00:00 2001 From: Thomas Wood Date: Fri, 6 Sep 2024 14:02:37 +0100 Subject: [PATCH 1/3] Allow user to add/remove custom drugs at run time #6 --- src/drug_named_entity_recognition/__init__.py | 3 +- .../drugs_finder.py | 37 ++++++++++++++ tests/test_custom_modifications.py | 51 +++++++++++++++++++ 3 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 tests/test_custom_modifications.py diff --git a/src/drug_named_entity_recognition/__init__.py b/src/drug_named_entity_recognition/__init__.py index dd2a8ed..33ec671 100644 --- a/src/drug_named_entity_recognition/__init__.py +++ b/src/drug_named_entity_recognition/__init__.py @@ -29,5 +29,4 @@ __version__ = "2.0.0" - -from drug_named_entity_recognition.drugs_finder import find_drugs +from drug_named_entity_recognition.drugs_finder import find_drugs, add_custom_drug_synonym, add_custom_new_drug diff --git a/src/drug_named_entity_recognition/drugs_finder.py b/src/drug_named_entity_recognition/drugs_finder.py index f1e8ca9..cb97f54 100644 --- a/src/drug_named_entity_recognition/drugs_finder.py +++ b/src/drug_named_entity_recognition/drugs_finder.py @@ -75,6 +75,43 @@ def get_ngrams(text): ngram_to_variant[ngram].append(drug_variant) +def add_custom_drug_synonym(drug_variant: str, canonical_name: str, optional_variant_data: dict = None): + drug_variant = drug_variant.lower() + canonical_name = canonical_name.lower() + drug_variant_to_canonical[drug_variant] = [canonical_name] + if optional_variant_data is not None and len(optional_variant_data) > 0: + drug_variant_to_variant_data[drug_variant] = optional_variant_data + + ngrams = get_ngrams(drug_variant) + variant_to_ngrams[drug_variant] = ngrams + for ngram in ngrams: + if ngram not in ngram_to_variant: + ngram_to_variant[ngram] = [] + ngram_to_variant[ngram].append(drug_variant) + + return f"Added {drug_variant} as a synonym for {canonical_name}. 
Optional data attached to this synonym = {optional_variant_data}" + + +def add_custom_new_drug(drug_name, drug_data): + drug_name = drug_name.lower() + drug_canonical_to_data[drug_name] = drug_data + add_custom_drug_synonym(drug_name, drug_name) + + return f"Added {drug_name} to the tool with data {drug_data}" + + +def remove_drug_synonym(drug_variant: str): + drug_variant = drug_variant.lower() + ngrams = get_ngrams(drug_variant) + + del variant_to_ngrams[drug_variant] + + for ngram in ngrams: + ngram_to_variant[ngram].remove(drug_variant) + + return f"Added {drug_variant} from dictionary" + + def get_fuzzy_match(surface_form: str): query_ngrams = get_ngrams(surface_form) candidate_to_num_matching_ngrams = Counter() diff --git a/tests/test_custom_modifications.py b/tests/test_custom_modifications.py new file mode 100644 index 0000000..a3b4374 --- /dev/null +++ b/tests/test_custom_modifications.py @@ -0,0 +1,51 @@ +''' +MIT License + +Copyright (c) 2023 Fast Data Science Ltd (https://fastdatascience.com) + +Maintainer: Thomas Wood + +Tutorial at https://fastdatascience.com/drug-named-entity-recognition-python-library/ + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +''' + +import unittest + +from drug_named_entity_recognition.drugs_finder import find_drugs, add_custom_drug_synonym, add_custom_new_drug + + +class TestDrugsFinderModifications(unittest.TestCase): + + def test_drug_synonym_not_working_first(self): + drugs = find_drugs("i bought some potato".split(" ")) + + print (drugs) + + self.assertEqual(0, len(drugs)) + + + def test_drug_synonym_1(self): + add_custom_drug_synonym("potato", "sertraline") + + drugs = find_drugs("i bought some potato".split(" ")) + + self.assertEqual(1, len(drugs)) + self.assertEqual("Sertraline", drugs[0][0]['name']) From 6c6f12b259ae925bdc5cc8c85790992242a9b3e2 Mon Sep 17 00:00:00 2001 From: Thomas Wood Date: Fri, 6 Sep 2024 15:18:23 +0100 Subject: [PATCH 2/3] Make less tolerant of spelling errors --- src/drug_named_entity_recognition/__init__.py | 3 +- .../drugs_finder.py | 68 ++++++++++++------- tests/test_custom_modifications.py | 32 ++++++++- tests/test_drugs_finder.py | 8 +++ 4 files changed, 83 insertions(+), 28 deletions(-) diff --git a/src/drug_named_entity_recognition/__init__.py b/src/drug_named_entity_recognition/__init__.py index 33ec671..1374a7f 100644 --- a/src/drug_named_entity_recognition/__init__.py +++ b/src/drug_named_entity_recognition/__init__.py @@ -29,4 +29,5 @@ __version__ = "2.0.0" -from drug_named_entity_recognition.drugs_finder import find_drugs, add_custom_drug_synonym, add_custom_new_drug +from drug_named_entity_recognition.drugs_finder import find_drugs, add_custom_drug_synonym, add_custom_new_drug, \ + reset_drugs_data, remove_drug_synonym diff --git a/src/drug_named_entity_recognition/drugs_finder.py b/src/drug_named_entity_recognition/drugs_finder.py index cb97f54..b4e9551 100644 --- a/src/drug_named_entity_recognition/drugs_finder.py +++ b/src/drug_named_entity_recognition/drugs_finder.py @@ -39,21 +39,14 @@ this_path = pathlib.Path(__file__).parent.resolve() -# Load dictionary from disk - with bz2.open(this_path.joinpath("drug_ner_dictionary.pkl.bz2"), "rb") as f: d = pkl.load(f) -drug_variant_to_canonical = d["drug_variant_to_canonical"] -drug_canonical_to_data = d["drug_canonical_to_data"] -drug_variant_to_variant_data = d["drug_variant_to_variant_data"] - -for variant, canonicals in drug_variant_to_canonical.items(): - for canonical in canonicals: - if canonical in drug_canonical_to_data: - if "synonyms" not in drug_canonical_to_data[canonical]: - drug_canonical_to_data[canonical]["synonyms"] = [] - drug_canonical_to_data[canonical]["synonyms"].append(variant) +drug_variant_to_canonical = {} +drug_canonical_to_data = {} +drug_variant_to_variant_data = {} +ngram_to_variant = {} +variant_to_ngrams = {} def get_ngrams(text): @@ -64,15 +57,32 @@ def get_ngrams(text): return ngrams -ngram_to_variant = {} -variant_to_ngrams = {} -for drug_variant in drug_variant_to_canonical: - ngrams = get_ngrams(drug_variant) - variant_to_ngrams[drug_variant] = ngrams - for ngram in ngrams: - if ngram not in ngram_to_variant: - ngram_to_variant[ngram] = [] - ngram_to_variant[ngram].append(drug_variant) +# Load dictionary from disk +def reset_drugs_data(): + drug_variant_to_canonical.clear() + drug_canonical_to_data.clear() + drug_variant_to_variant_data.clear() + ngram_to_variant.clear() + variant_to_ngrams.clear() + + drug_variant_to_canonical.update(d["drug_variant_to_canonical"]) + drug_canonical_to_data.update(d["drug_canonical_to_data"]) + drug_variant_to_variant_data.update(d["drug_variant_to_variant_data"]) + + for variant, canonicals in drug_variant_to_canonical.items(): + for canonical in 
canonicals: + if canonical in drug_canonical_to_data: + if "synonyms" not in drug_canonical_to_data[canonical]: + drug_canonical_to_data[canonical]["synonyms"] = [] + drug_canonical_to_data[canonical]["synonyms"].append(variant) + + for drug_variant in drug_variant_to_canonical: + ngrams = get_ngrams(drug_variant) + variant_to_ngrams[drug_variant] = ngrams + for ngram in ngrams: + if ngram not in ngram_to_variant: + ngram_to_variant[ngram] = [] + ngram_to_variant[ngram].append(drug_variant) def add_custom_drug_synonym(drug_variant: str, canonical_name: str, optional_variant_data: dict = None): @@ -105,11 +115,13 @@ def remove_drug_synonym(drug_variant: str): ngrams = get_ngrams(drug_variant) del variant_to_ngrams[drug_variant] + del drug_variant_to_canonical[drug_variant] + del drug_variant_to_variant_data[drug_variant] for ngram in ngrams: ngram_to_variant[ngram].remove(drug_variant) - return f"Added {drug_variant} from dictionary" + return f"Removed {drug_variant} from dictionary" def get_fuzzy_match(surface_form: str): @@ -122,16 +134,21 @@ def get_fuzzy_match(surface_form: str): candidate_to_jaccard = {} for candidate, num_matching_ngrams in candidate_to_num_matching_ngrams.items(): - ngrams_in_query_and_candidate = ngrams.union(variant_to_ngrams[candidate]) + ngrams_in_query_and_candidate = query_ngrams.union(variant_to_ngrams[candidate]) jaccard = num_matching_ngrams / len(ngrams_in_query_and_candidate) candidate_to_jaccard[candidate] = jaccard + query_length = len(surface_form) if len(candidate_to_num_matching_ngrams) > 0: top_candidate = max(candidate_to_jaccard, key=candidate_to_jaccard.get) jaccard = candidate_to_jaccard[top_candidate] query_ngrams_missing_in_candidate = query_ngrams.difference(variant_to_ngrams[top_candidate]) candidate_ngrams_missing_in_query = variant_to_ngrams[top_candidate].difference(query_ngrams) - if max([len(query_ngrams_missing_in_candidate), len(candidate_ngrams_missing_in_query)]) <= 3: + + candidate_length = len(top_candidate) + length_diff = abs(query_length - candidate_length) + if max([len(query_ngrams_missing_in_candidate), len(candidate_ngrams_missing_in_query)]) <= 3 \ + and length_diff <= 2: return top_candidate, jaccard return None, None @@ -226,3 +243,6 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu match_data["structure_mol"] = structure return drug_matches + + +reset_drugs_data() diff --git a/tests/test_custom_modifications.py b/tests/test_custom_modifications.py index a3b4374..ee2f747 100644 --- a/tests/test_custom_modifications.py +++ b/tests/test_custom_modifications.py @@ -29,23 +29,49 @@ import unittest -from drug_named_entity_recognition.drugs_finder import find_drugs, add_custom_drug_synonym, add_custom_new_drug +from drug_named_entity_recognition.drugs_finder import find_drugs, add_custom_drug_synonym, reset_drugs_data, \ + add_custom_new_drug, remove_drug_synonym class TestDrugsFinderModifications(unittest.TestCase): def test_drug_synonym_not_working_first(self): + reset_drugs_data() drugs = find_drugs("i bought some potato".split(" ")) - print (drugs) + print(drugs) self.assertEqual(0, len(drugs)) - def test_drug_synonym_1(self): + reset_drugs_data() add_custom_drug_synonym("potato", "sertraline") drugs = find_drugs("i bought some potato".split(" ")) self.assertEqual(1, len(drugs)) self.assertEqual("Sertraline", drugs[0][0]['name']) + + def test_completely_new_drug(self): + reset_drugs_data() + add_custom_new_drug("potato", {"name": "solanum tuberosum"}) + + drugs = find_drugs("i bought some 
potato".split(" ")) + + self.assertEqual(1, len(drugs)) + self.assertEqual("solanum tuberosum", drugs[0][0]['name']) + + def test_drug_synonym_working_control(self): + reset_drugs_data() + drugs = find_drugs("i bought some Sertraline".split(" ")) + + self.assertEqual(1, len(drugs)) + self.assertEqual("Sertraline", drugs[0][0]['name']) + + def test_drug_synonym_absent_after_erasure(self): + reset_drugs_data() + remove_drug_synonym("sertraline") + + drugs = find_drugs("i bought some Sertraline".split(" ")) + + self.assertEqual(0, len(drugs)) diff --git a/tests/test_drugs_finder.py b/tests/test_drugs_finder.py index 289863d..13d87f0 100644 --- a/tests/test_drugs_finder.py +++ b/tests/test_drugs_finder.py @@ -157,3 +157,11 @@ def test_dry_ice(self): drugs = find_drugs("i bought some dry ice".split(" "), is_include_structure=True) self.assertEqual(0, len(drugs)) + + def test_mounjaro_misspelt(self): + drugs = find_drugs("i bought some Monjaro".split(" "), is_include_structure=True, is_fuzzy_match=True) + + import json + print(json.dumps(drugs, indent=4)) + + self.assertEqual(1, len(drugs)) From d36aefab1f92476f629f9921ca5e0c3e76310ab9 Mon Sep 17 00:00:00 2001 From: Thomas Wood Date: Fri, 6 Sep 2024 15:26:14 +0100 Subject: [PATCH 3/3] Refine fuzzy matching --- .../drugs_finder.py | 44 +- src/drug_named_entity_recognition/util.py | 795 ++++++++++++++++++ 2 files changed, 819 insertions(+), 20 deletions(-) create mode 100644 src/drug_named_entity_recognition/util.py diff --git a/src/drug_named_entity_recognition/drugs_finder.py b/src/drug_named_entity_recognition/drugs_finder.py index b4e9551..dafd201 100644 --- a/src/drug_named_entity_recognition/drugs_finder.py +++ b/src/drug_named_entity_recognition/drugs_finder.py @@ -34,6 +34,7 @@ from collections import Counter from drug_named_entity_recognition.structure_file_downloader import download_structures +from drug_named_entity_recognition.util import stopwords dbid_to_mol_lookup = {} @@ -189,7 +190,8 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu # Search for 2 token sequences for token_idx, token in enumerate(tokens[:-1]): - cand = token + " " + tokens[token_idx + 1] + next_token = tokens[token_idx + 1] + cand = token + " " + next_token cand_norm = cand.lower() match = drug_variant_to_canonical.get(cand_norm, None) @@ -203,16 +205,17 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu is_exclude.add(token_idx) is_exclude.add(token_idx + 1) elif is_fuzzy_match: - fuzzy_matched_variant, similarity = get_fuzzy_match(cand_norm) - if fuzzy_matched_variant is not None: - match = drug_variant_to_canonical[fuzzy_matched_variant] - for m in match: - match_data = dict(drug_canonical_to_data[m]) | drug_variant_to_variant_data.get( - fuzzy_matched_variant, {}) - match_data["match_type"] = "fuzzy" - match_data["match_similarity"] = similarity - - drug_matches.append((match_data, token_idx, token_idx + 1)) + if token.lower() not in stopwords and next_token.lower() not in stopwords: + fuzzy_matched_variant, similarity = get_fuzzy_match(cand_norm) + if fuzzy_matched_variant is not None: + match = drug_variant_to_canonical[fuzzy_matched_variant] + for m in match: + match_data = dict(drug_canonical_to_data[m]) | drug_variant_to_variant_data.get( + fuzzy_matched_variant, {}) + match_data["match_type"] = "fuzzy" + match_data["match_similarity"] = similarity + + drug_matches.append((match_data, token_idx, token_idx + 1)) for token_idx, token in enumerate(tokens): if token_idx in is_exclude: @@ 
-224,15 +227,16 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu match_data = dict(drug_canonical_to_data[m]) | drug_variant_to_variant_data.get(cand_norm, {}) drug_matches.append((match_data, token_idx, token_idx)) elif is_fuzzy_match: - fuzzy_matched_variant, similarity = get_fuzzy_match(cand_norm) - if fuzzy_matched_variant is not None: - match = drug_variant_to_canonical[fuzzy_matched_variant] - for m in match: - match_data = dict(drug_canonical_to_data[m]) | drug_variant_to_variant_data.get( - fuzzy_matched_variant, {}) - match_data["match_type"] = "fuzzy" - match_data["match_similarity"] = similarity - drug_matches.append((match_data, token_idx, token_idx + 1)) + if cand_norm not in stopwords and len(cand_norm) > 3: + fuzzy_matched_variant, similarity = get_fuzzy_match(cand_norm) + if fuzzy_matched_variant is not None: + match = drug_variant_to_canonical[fuzzy_matched_variant] + for m in match: + match_data = dict(drug_canonical_to_data[m]) | drug_variant_to_variant_data.get( + fuzzy_matched_variant, {}) + match_data["match_type"] = "fuzzy" + match_data["match_similarity"] = similarity + drug_matches.append((match_data, token_idx, token_idx + 1)) if is_include_structure: for match in drug_matches: diff --git a/src/drug_named_entity_recognition/util.py b/src/drug_named_entity_recognition/util.py new file mode 100644 index 0000000..454b43c --- /dev/null +++ b/src/drug_named_entity_recognition/util.py @@ -0,0 +1,795 @@ +stopwords = {'abbott', + 'abello', + 'about', + 'above', + 'across', + 'actelion', + 'aesica', + 'afghanistan', + 'africa', + 'african', + 'after', + 'afterwards', + 'again', + 'against', + 'aient', + 'aies', + 'albania', + 'alcon', + 'algeria', + 'allergan', + 'almost', + 'almus', + 'alone', + 'along', + 'alpharma', + 'already', + 'also', + 'altana', + 'although', + 'always', + 'american', + 'amgen', + 'among', + 'amongst', + 'amount', + 'andorra', + 'angola', + 'anguilla', + 'another', + 'antarctica', + 'antigua', + 'anyhow', + 'anyone', + 'anything', + 'anyway', + 'anywhere', + 'april', + 'arab', + 'arabia', + 'aren', + 'argentina', + 'armenia', + 'around', + 'aruba', + 'ascension', + 'assertio', + 'astrazeneca', + 'august', + 'aura', + 'aurai', + 'auraient', + 'aurais', + 'aurait', + 'auras', + 'aurez', + 'auriez', + 'aurions', + 'aurons', + 'auront', + 'australia', + 'austria', + 'avaient', + 'avais', + 'avait', + 'avec', + 'aventis', + 'avez', + 'aviez', + 'avions', + 'avons', + 'ayant', + 'ayante', + 'ayantes', + 'ayants', + 'ayez', + 'ayons', + 'azerbaijan', + 'back', + 'bahamas', + 'bahrain', + 'bangladesh', + 'barbados', + 'barbuda', + 'bausch', + 'baxter', + 'bayer', + 'became', + 'because', + 'become', + 'becomes', + 'becoming', + 'becton', + 'been', + 'before', + 'beforehand', + 'behind', + 'beiersdorf', + 'being', + 'belarus', + 'belgium', + 'belize', + 'below', + 'benckiser', + 'benin', + 'berk', + 'bermuda', + 'beside', + 'besides', + 'between', + 'beyond', + 'bhutan', + 'biogen', + 'bioscience', + 'bissau', + 'boehringer', + 'bolivarian', + 'bolivia', + 'bonaire', + 'boots', + 'bosnia', + 'both', + 'botswana', + 'bottom', + 'bouvet', + 'braun', + 'brazil', + 'bristol', + 'british', + 'brunei', + 'bulgaria', + 'burkina', + 'burundi', + 'cabo', + 'caicos', + 'caledonia', + 'call', + 'cambodia', + 'cameroon', + 'canada', + 'cannot', + 'cayman', + 'celltech', + 'central', + 'cephalon', + 'chad', + 'chemidex', + 'chiesi', + 'chile', + 'china', + 'christmas', + 'chugai', + 'cilag', + 'city', + 'clinical', + 'cocos', + 
'colgate', + 'colombia', + 'coloplast', + 'comoros', + 'congo', + 'consumer', + 'convatec', + 'cook', + 'copyright', + 'costa', + 'could', + 'couldn', + 'croatia', + 'crookes', + 'cuba', + 'cunha', + 'cyprus', + 'czechia', + 'dans', + 'darussalam', + 'date', + 'davis', + 'december', + 'democratic', + 'denmark', + 'dentsply', + 'diagnostics', + 'dickinson', + 'didn', + 'dista', + 'djibouti', + 'does', + 'doesn', + 'doing', + 'dominica', + 'dominican', + 'done', + 'down', + 'dupont', + 'during', + 'dutch', + 'each', + 'ecuador', + 'egypt', + 'eight', + 'eisai', + 'either', + 'eleven', + 'elle', + 'else', + 'elsewhere', + 'emirates', + 'empty', + 'enough', + 'equatorial', + 'eritrea', + 'estonia', + 'eswatini', + 'ethicon', + 'ethiopia', + 'eudract', + 'eues', + 'eurent', + 'eusse', + 'eussent', + 'eusses', + 'eussiez', + 'eussions', + 'eustatius', + 'even', + 'ever', + 'every', + 'everyone', + 'everything', + 'everywhere', + 'except', + 'fabre', + 'falkland', + 'faroe', + 'faso', + 'february', + 'federated', + 'federation', + 'ferring', + 'fifteen', + 'fifty', + 'fiji', + 'finland', + 'first', + 'five', + 'florizel', + 'former', + 'formerly', + 'forty', + 'four', + 'france', + 'french', + 'fresenius', + 'friday', + 'from', + 'front', + 'full', + 'furent', + 'further', + 'fusse', + 'fussent', + 'fusses', + 'fussiez', + 'fussions', + 'futuna', + 'gabon', + 'galderma', + 'galpharm', + 'gambia', + 'gamble', + 'garnier', + 'gate', + 'georgia', + 'germany', + 'ghana', + 'gibraltar', + 'gilead', + 'give', + 'glaxosmithkline', + 'gotten', + 'greece', + 'greenland', + 'grenada', + 'grenadines', + 'grifols', + 'guadeloupe', + 'guam', + 'guatemala', + 'guernsey', + 'guiana', + 'guinea', + 'guyana', + 'hadn', + 'haiti', + 'hakko', + 'hasn', + 'have', + 'haven', + 'having', + 'health', + 'healthcare', + 'heard', + 'heinz', + 'helena', + 'hence', + 'here', + 'hereafter', + 'hereby', + 'herein', + 'hereupon', + 'hers', + 'herself', + 'herzegovina', + 'hillcross', + 'himself', + 'hoechst', + 'holy', + 'honduras', + 'hong', + 'however', + 'http', + 'https', + 'hundred', + 'hungary', + 'iceland', + 'indeed', + 'india', + 'indian', + 'indonesia', + 'ingelheim', + 'into', + 'invicta', + 'ipsen', + 'iran', + 'iraq', + 'ireland', + 'islamic', + 'island', + 'islands', + 'isle', + 'israel', + 'italy', + 'itself', + 'ivax', + 'jamaica', + 'janssen', + 'january', + 'japan', + 'jersey', + 'jordan', + 'july', + 'june', + 'just', + 'kazakhstan', + 'keeling', + 'keep', + 'kenya', + 'king', + 'kingdom', + 'kiribati', + 'kitts', + 'kong', + 'korea', + 'kuwait', + 'kyowa', + 'kyrgyzstan', + 'lambert', + 'lanka', + 'last', + 'latter', + 'latterly', + 'latvia', + 'least', + 'lebanon', + 'lederie', + 'leone', + 'lesotho', + 'less', + 'leste', + 'leur', + 'liberia', + 'libya', + 'liechtenstein', + 'lifescan', + 'lilly', + 'lithuania', + 'lomb', + 'lucia', + 'lundbeck', + 'luxembourg', + 'maarten', + 'macao', + 'macedonia', + 'madagascar', + 'made', + 'mais', + 'make', + 'malawi', + 'malaysia', + 'maldives', + 'mali', + 'malta', + 'malvinas', + 'many', + 'march', + 'mariana', + 'marino', + 'marion', + 'marshall', + 'martin', + 'martindale', + 'martinique', + 'mauritania', + 'mauritius', + 'mayen', + 'mayne', + 'mayotte', + 'mcdonald', + 'mcneil', + 'meanwhile', + 'meda', + 'medac', + 'medical', + 'medisense', + 'menarini', + 'merck', + 'mexico', + 'micronesia', + 'might', + 'mightn', + 'milupa', + 'mine', + 'minor', + 'miquelon', + 'moldova', + 'monaco', + 'monday', + 'mongolia', + 'montenegro', + 'montserrat', + 'more', + 
'moreover', + 'morocco', + 'most', + 'mostly', + 'move', + 'mozambique', + 'much', + 'must', + 'mustn', + 'myanmar', + 'myers', + 'myself', + 'name', + 'namely', + 'namibia', + 'nauru', + 'needn', + 'neither', + 'nepal', + 'netherlands', + 'neutrogena', + 'never', + 'nevertheless', + 'nevis', + 'next', + 'nicaragua', + 'niger', + 'nigeria', + 'nine', + 'niue', + 'nobody', + 'none', + 'noone', + 'nordisk', + 'norfolk', + 'north', + 'northern', + 'norway', + 'nothing', + 'notre', + 'nous', + 'novartis', + 'november', + 'novo', + 'nowhere', + 'nutrition', + 'nycomed', + 'oasteur', + 'ocean', + 'octapharma', + 'october', + 'often', + 'oman', + 'once', + 'only', + 'onto', + 'orion', + 'other', + 'others', + 'otherwise', + 'otsuka', + 'ours', + 'ourselves', + 'outlying', + 'over', + 'page', + 'pakistan', + 'palau', + 'palestine', + 'palmolive', + 'panama', + 'papua', + 'paraguay', + 'parke', + 'part', + 'path', + 'people', + 'perhaps', + 'peru', + 'pfizer', + 'pharm', + 'pharma', + 'pharmaceuticals', + 'pharmacia', + 'philippines', + 'pierre', + 'pitcairn', + 'please', + 'plough', + 'plurinational', + 'poland', + 'polynesia', + 'portugal', + 'poulenc', + 'pour', + 'principe', + 'procter', + 'products', + 'proprietary', + 'province', + 'pubmed', + 'puerto', + 'qatar', + 'quite', + 'rather', + 'really', + 'reckitt', + 'regarding', + 'reproduction', + 'republic', + 'reserved', + 'revision', + 'rica', + 'rico', + 'roche', + 'romania', + 'rosemont', + 'ross', + 'roussel', + 'russian', + 'rwanda', + 'rybar', + 'saba', + 'sahara', + 'saint', + 'salts', + 'salvador', + 'same', + 'samoa', + 'sandoz', + 'sandwich', + 'sankyo', + 'sanofi', + 'saturday', + 'saudi', + 'schering', + 'schwarz', + 'searle', + 'seem', + 'seemed', + 'seeming', + 'seems', + 'senegal', + 'september', + 'sera', + 'serai', + 'seraient', + 'serais', + 'serait', + 'seras', + 'serbia', + 'serez', + 'seriez', + 'serions', + 'serious', + 'serono', + 'serons', + 'seront', + 'servier', + 'several', + 'seychelles', + 'shan', + 'shire', + 'should', + 'shouldn', + 'show', + 'side', + 'sierra', + 'sigma', + 'since', + 'singapore', + 'sint', + 'sixty', + 'slovakia', + 'slovenia', + 'snbts', + 'soient', + 'sois', + 'soit', + 'solomon', + 'solvay', + 'somalia', + 'some', + 'somehow', + 'someone', + 'something', + 'sometime', + 'sometimes', + 'somewhere', + 'sommes', + 'sont', + 'south', + 'southern', + 'soyez', + 'soyons', + 'spain', + 'squibb', + 'state', + 'states', + 'stiefel', + 'still', + 'strain', + 'strains', + 'strictly', + 'such', + 'sudan', + 'suis', + 'sunday', + 'suriname', + 'svalbard', + 'sweden', + 'switzerland', + 'syrian', + 'taiwan', + 'tajikistan', + 'take', + 'takeda', + 'tanzania', + 'taro', + 'territories', + 'territory', + 'teva', + 'thailand', + 'than', + 'that', + 'their', + 'theirs', + 'them', + 'themselves', + 'then', + 'thence', + 'there', + 'thereafter', + 'thereby', + 'therefore', + 'therein', + 'thereupon', + 'these', + 'they', + 'third', + 'this', + 'thornton', + 'those', + 'though', + 'three', + 'through', + 'throughout', + 'thru', + 'thursday', + 'thus', + 'timor', + 'title', + 'tobago', + 'together', + 'togo', + 'tokelau', + 'tome', + 'tonga', + 'toward', + 'towards', + 'trinidad', + 'trinity', + 'tristan', + 'tuesday', + 'tunisia', + 'turkey', + 'turkmenistan', + 'turks', + 'tuvalu', + 'twelve', + 'twenty', + 'tyco', + 'uganda', + 'ukraine', + 'unauthorised', + 'unauthorized', + 'under', + 'united', + 'univar', + 'unless', + 'until', + 'upon', + 'uruguay', + 'used', + 'using', + 'uzbekistan', + 'valeant', + 
'vanuatu', + 'various', + 'vatican', + 'venezuela', + 'verde', + 'very', + 'viatris', + 'viet', + 'vincent', + 'virgin', + 'votre', + 'vous', + 'wallis', + 'warner', + 'wasn', + 'wednesday', + 'well', + 'were', + 'weren', + 'western', + 'what', + 'whatever', + 'when', + 'whence', + 'whenever', + 'where', + 'whereafter', + 'whereas', + 'whereby', + 'wherein', + 'whereupon', + 'wherever', + 'whether', + 'which', + 'while', + 'whither', + 'whoever', + 'whole', + 'whom', + 'whose', + 'will', + 'with', + 'within', + 'without', + 'wockhardt', + 'would', + 'wouldn', + 'wyeth', + 'yamanouchi', + 'yemen', + 'your', + 'yours', + 'yourself', + 'yourselves', + 'zambia', + 'zealand', + 'zimbabwe'}
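
Usage sketch (not part of the patch series): the runtime-customisation API introduced in patches 1 and 2 can be exercised as below. This mirrors the behaviour asserted in tests/test_custom_modifications.py; the "potato" entries are illustrative test fixtures, not shipped dictionary data.

    from drug_named_entity_recognition import (
        add_custom_drug_synonym,
        add_custom_new_drug,
        find_drugs,
        remove_drug_synonym,
        reset_drugs_data,
    )

    # Attach a new synonym to a drug already in the bundled dictionary.
    add_custom_drug_synonym("potato", "sertraline")
    print(find_drugs("i bought some potato".split(" ")))  # one match, name "Sertraline"

    # Register a drug the shipped dictionary does not know about at all.
    reset_drugs_data()  # drop the custom synonym above and reload the bundled data
    add_custom_new_drug("potato", {"name": "solanum tuberosum"})
    print(find_drugs("i bought some potato".split(" ")))  # one match, name "solanum tuberosum"

    # Remove a synonym so it is no longer matched.
    reset_drugs_data()
    remove_drug_synonym("sertraline")
    print(find_drugs("i bought some Sertraline".split(" ")))  # no matches

Because reset_drugs_data() is also called once at import time, custom synonyms and drugs persist only for the current process and are discarded on the next reset.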
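
Fuzzy-matching sketch (also not part of the patch series): after patches 2 and 3, get_fuzzy_match only accepts a candidate when at most 3 n-grams differ in either direction and the candidate's length is within 2 characters of the query, and single tokens that are stopwords (see util.py) or no longer than 3 characters are never fuzzy-matched. The misspelling test added to tests/test_drugs_finder.py illustrates what still gets through:

    from drug_named_entity_recognition import find_drugs

    # "Monjaro" is a misspelling of Mounjaro: it differs by few n-grams and by
    # only one character in length, so the stricter fuzzy matcher still accepts it.
    drugs = find_drugs("i bought some Monjaro".split(" "), is_fuzzy_match=True)
    assert len(drugs) == 1
    assert drugs[0][0]["match_type"] == "fuzzy"

Common words such as "some" (in the stopword list) and short tokens such as "ice" are skipped before the n-gram comparison, which is the point of the "Make less tolerant of spelling errors" and "Refine fuzzy matching" commits.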