# --- newspaper/article.py: method added to the article class -------------
# (shown dedented; in the file it lives inside the class body)
def translater(self, dest='en'):
    """Translate the article text with googletrans.

    The requested language code is stored on ``self.dest`` and the
    translated text is returned as a string.

    Arguments:
        dest: language code to translate ``self.text`` into
              (defaults to ``'en'``).

    Returns:
        The translated text, or ``''`` when ``self.text`` is empty.
    """
    self.dest = dest
    # Nothing to translate: skip the network round trip rather than
    # sending an empty payload to the translation service.
    if not self.text:
        return ''
    translator = Translator()
    # Pass ``dest`` by keyword so the call does not depend on the
    # positional order of ``Translator.translate``.
    result = translator.translate(self.text, dest=self.dest)
    return result.text


# --- newspaper/text.py: Marathi stop-word support ------------------------
class StopWordsMarathi(StopWords):
    """Marathi segmentation
    """
    def __init__(self, language='mr'):
        # super() must name *this* class. Naming StopWordsHindi fails at
        # construction because a StopWordsMarathi instance is not a
        # StopWordsHindi.
        # ``language`` is accepted for signature compatibility; 'mr' is
        # always what gets loaded.
        super(StopWordsMarathi, self).__init__(language='mr')

    def get_stopword_count(self, content):
        """Count the words and the stop words found in ``content``.

        Returns a ``WordStats`` carrying the total word count, the number
        of stop words found and the list of those stop words. Empty
        content yields a fresh, unpopulated ``WordStats``.
        """
        if not content:
            return WordStats()
        ws = WordStats()
        stripped_input = self.remove_punctuation(content)
        candidate_words = self.candidate_words(stripped_input)
        overlapping_stopwords = []
        word_count = 0
        for word in candidate_words:
            word_count += 1
            # Only words that are actually in the stop list count as
            # overlap. Appending the whole stop list regardless of the
            # candidate word made the count independent of the content.
            if word in self.STOP_WORDS:
                overlapping_stopwords.append(word)

        ws.set_word_count(word_count)
        ws.set_stopword_count(len(overlapping_stopwords))
        ws.set_stop_words(overlapping_stopwords)
        return ws