Removed support for French to better integrate with automatic updates

jajsmith · Aug 14, 2020 · e0eac18 · e0eac18
1 parent 3b0c78c
commit e0eac18
Showing 1 changed file with 8 additions and 4 deletions.
diff --git a/topic_modelling.py b/topic_modelling.py
@@ -18,7 +18,7 @@
 
 # Lemmatization
 import spacy
-import fr_core_news_sm
+# import fr_core_news_sm
 
 # Printing model topics
 from pprint import pprint
@@ -39,6 +39,7 @@
 # Utils
 import numpy as np
 import pandas as pd
+from warnings import warn
 
 # Preprocessing
 
@@ -100,8 +101,11 @@ def lemmatize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB'], lang='english'):
     Returns: a list of documents with words replaced by their lemmas and removed if they do not constitute a part of speech indicated in `allowed_postags`
     """
 
-    # French for Quebec
-    nlp = spacy.load('en', disable=['parser', 'ner']) if lang == 'english' else fr_core_news_sm.load(disable=['parser', 'ner'])
+    if lang != 'english':
+        warn('Support only currently exists for English language processing')
+        return None
+
+    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) # if lang == 'english' else fr_core_news_sm.load(disable=['parser', 'ner'])
     return [[token.lemma_ for token in nlp(" ".join(doc)) if token.pos_ in allowed_postags] for doc in texts]
 
 def make_bigrams(texts, min_count=5, threshold=100):
@@ -473,4 +477,4 @@ def lda_from_province(province, doc_attrib='source_full_text', start_date=dateti
         use_coherence=use_coherence,
         random_state=random_state,
         plot=plot,  
-        verbose=verbose)
+        verbose=verbose)