Skip to content

Commit

Permalink
Removed support for French to better integrate with automatic updates
Browse files Browse the repository at this point in the history
  • Loading branch information
CryogenicallyPreservedWombat committed Aug 14, 2020
1 parent 3b0c78c commit e0eac18
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions topic_modelling.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

# Lemmatization
import spacy
import fr_core_news_sm
# import fr_core_news_sm

# Printing model topics
from pprint import pprint
Expand All @@ -39,6 +39,7 @@
# Utils
import numpy as np
import pandas as pd
from warnings import warn

# Preprocessing

Expand Down Expand Up @@ -100,8 +101,11 @@ def lemmatize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB'], lang='english'):
Returns: a list of documents in which each word is replaced by its lemma, and any word whose part of speech is not listed in `allowed_postags` is removed
"""

# French for Quebec
nlp = spacy.load('en', disable=['parser', 'ner']) if lang == 'english' else fr_core_news_sm.load(disable=['parser', 'ner'])
if lang != 'english':
warn('Support only currently exists for English language processing')
return None

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) # if lang == 'english' else fr_core_news_sm.load(disable=['parser', 'ner'])
return [[token.lemma_ for token in nlp(" ".join(doc)) if token.pos_ in allowed_postags] for doc in texts]

def make_bigrams(texts, min_count=5, threshold=100):
Expand Down Expand Up @@ -473,4 +477,4 @@ def lda_from_province(province, doc_attrib='source_full_text', start_date=dateti
use_coherence=use_coherence,
random_state=random_state,
plot=plot,
verbose=verbose)
verbose=verbose)

0 comments on commit e0eac18

Please sign in to comment.