-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathreader.py
120 lines (102 loc) · 3.85 KB
/
reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pymongo
import logging
import re
import nltk
# NOTE: the nltk.download() calls below run at import time, every time this
# module is imported.
# Stopword corpus read by Reader.__init__ via stopwords.words('english').
nltk.download('stopwords')
from nltk.corpus import stopwords
import nltk
# WordNet data; presumably required by the WordNetLemmatizer used in
# Reader -- confirm against the installed NLTK version.
nltk.download('wordnet')
import nltk
# Open Multilingual WordNet data; presumably also needed by the lemmatizer
# on recent NLTK releases -- confirm.
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
# Emit log records as "LEVEL : message" at INFO and above.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# Set the root logger level directly as well; basicConfig() does nothing when
# the root logger already has handlers configured.
logging.root.level = logging.INFO
# Module-level logger named after this module.
_logger = logging.getLogger(__name__)
class Reader(object):
    ''' Source reader object feeds other objects to iterate through a source.

    Holds the text-normalisation pipeline (prepare_words) shared by concrete
    readers; subclasses override iterate() to yield records from a source.
    '''

    # Split on every non-word character except '.', '|' and '$'. Tokens that
    # still contain one of those characters after the '.'-strip are dropped
    # later by the isalnum() filter.
    _TOKEN_SPLIT = re.compile(r'(?![\.|\$])[^\w\d]')

    def __init__(self):
        ''' init

        Loads the English stopword set and creates the WordNet lemmatizer
        used by prepare_words().
        '''
        exclude_stops = set(('.', '(', ')'))
        self.stop = set(stopwords.words('english')) - exclude_stops
        self.wn_lemmatizer = WordNetLemmatizer()

    def prepare_words(self, text):
        ''' Prepare text

        Lower-cases and tokenizes ``text``, then keeps only tokens that are
        at least 3 characters long, alphanumeric, not purely numeric and not
        stopwords. The surviving tokens are lemmatized.

        :param text: raw text to normalise. A falsy value (None, "") yields
                     an empty list.
        :return: list of unique lemmas, in order of first occurrence.
        '''
        if not text:
            return []

        lemmas = []
        seen = set()
        for token in self._TOKEN_SPLIT.split(text.lower()):
            # drop sentence-final / abbreviation dots around the token
            token = token.strip('.')
            # too short, not alphanumeric, or numbers only
            if len(token) < 3 or not token.isalnum() or token.isdigit():
                continue
            # stopped words
            if token in self.stop:
                continue
            lemma = self.wn_lemmatizer.lemmatize(token)
            # De-duplicate on the lemma rather than on the raw token: two
            # different surface forms (e.g. "cats" and "cat") can collapse to
            # the same lemma, which would otherwise reintroduce duplicates.
            if lemma in seen:
                continue
            seen.add(lemma)
            lemmas.append(lemma)
        return lemmas

    def iterate(self):
        ''' virtual method

        Placeholder to be overridden by concrete readers; the base
        implementation does nothing and returns None.
        '''
        pass
class MongoReader(Reader):
    ''' Reader that streams documents from a MongoDB collection. '''

    def __init__(self, dbName=None, collName=None,
                 mongoURI="mongodb://localhost:27017", limit=None):
        ''' init
        :param mongoURI: mongoDB URI. default: localhost:27017
        :param dbName: MongoDB database name.
        :param collName: MongoDB Collection name.
        :param limit: query limit
        '''
        Reader.__init__(self)
        # Collection handle; created on the first call to iterate().
        self.conn = None
        self.mongoURI = mongoURI
        self.dbName = dbName
        self.collName = collName
        self.limit = limit
        # Fields requested from MongoDB for every document.
        self.fields = ['Title', 'Plot', 'Actors', 'Genre']
        # Field whose comma-separated value becomes the record's tags.
        self.key_field = 'Genre'
        # Fields concatenated into the text passed to prepare_words().
        self.return_fields = ['Title', 'Plot', 'Actors']

    def get_value(self, value):
        ''' Convenience method to retrieve a field value as text.

        :param value: a string, a list of strings, or a falsy value.
        :return: falsy values are returned unchanged; lists are joined with
                 single spaces; strings are round-tripped through UTF-8 with
                 the 'replace' error handler.
        '''
        if not value:
            return value
        if isinstance(value, list):
            return ' '.join([v.encode('utf-8', 'replace').decode('utf-8', 'replace') for v in value])
        else:
            return value.encode('utf-8', 'replace').decode('utf-8', 'replace')

    def iterate(self):
        ''' Iterate through the source reader

        Yields one dict per MongoDB document with the keys ``texts``
        (normalised tokens of the return fields), ``tags`` (values of the key
        field), ``title`` and ``plot``. When ``limit`` is set, a random
        sample of that size is read through a ``$sample`` aggregation;
        otherwise the whole collection is scanned.

        :raises Exception: if the collection handle cannot be created.
        '''
        # Compare with None explicitly: PyMongo collections do not support
        # truth-value testing, so "if not self.conn" fails once the handle
        # has been created (i.e. on a second call to iterate()).
        if self.conn is None:
            try:
                self.conn = pymongo.MongoClient(self.mongoURI)[self.dbName][self.collName]
            except Exception as ex:
                raise Exception("ERROR establishing connection: %s" % ex)

        if self.limit:
            projection = {key: 1 for key in self.fields}
            cursor = self.conn.aggregate([{"$sample": {"size": self.limit}},
                                          {"$project": projection}])
        else:
            cursor = self.conn.find({}, self.fields)

        for doc in cursor:
            # Skip missing/empty fields so they do not leak the literal text
            # "None" into the content (and a bogus "none" token downstream).
            parts = []
            for f in self.return_fields:
                value = self.get_value(doc.get(f))
                if value:
                    parts.append(value)
            texts = self.prepare_words(" ".join(parts))

            # The key field is expected to be a comma-separated string; a
            # list is accepted too, mirroring get_value(). A missing key
            # field yields no tags instead of raising AttributeError.
            genre = doc.get(self.key_field)
            if not genre:
                raw_tags = []
            elif isinstance(genre, list):
                raw_tags = genre
            else:
                raw_tags = genre.split(',')
            tags = [t.strip() for t in raw_tags if t and t.strip()]

            yield {"texts": texts, "tags": tags,
                   "title": doc.get('Title'), "plot": doc.get('Plot')}
# Running the file as a script does nothing beyond the import-time setup
# above; the classes are meant to be imported and used by other code.
if __name__ == "__main__":
    pass