forked from r4f4/mc906
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparse.py
172 lines (133 loc) · 4.72 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
from re import split
from numpy import dot, array
from util import norm, stemWord
class Document(object):
    """
    A single text document: reads a file, tokenizes it into lowercase
    words, keeps per-word frequency counts and an optional numeric
    characteristic vector used by document-similarity computations.
    """
    def __init__(self, pathname, ignore_words=None, stem=True):
        """
        :pathname The pathname of the document.  May be None for an
                  intermediate result produced by the arithmetic operators.
        :ignore_words A set of words to skip when parsing the document.
        :stem The words will be stemmed (via util.stemWord) if True.
        """
        self._filename = pathname
        self._char_vector = None
        self._freq = dict()
        self.stem = stem
        self._ignore_words = ignore_words or set()
    def __len__(self):
        """ The number of distinct words in the current document """
        return len(self.freq)
    def __add__(self, val):
        """
        Sum of two documents' characteristic vectors, or broadcast
        addition of a scalar to this document's vector.

        NOTE(review): encode_array/decode_array are called here but are
        neither defined nor imported anywhere in this file, so __add__
        and __div__ raise NameError as written -- confirm where these
        helpers are supposed to come from.
        """
        docres = Document(None)
        # Add word frequencies when both operands are documents
        if isinstance(val, type(self)):
            docres.char_vector = encode_array(decode_array(self.char_vector) +
                                              decode_array(val.char_vector))
        # Increase each frequency by `val` when it is a scalar
        elif isinstance(val, (int, float)):
            docres.char_vector = encode_array(decode_array(self.char_vector) +
                                              decode_array(val))
        return docres
    def __div__(self, val):
        """
        Division of the characteristic vector by a scalar.
        (Python 2 only: Python 3 would dispatch to __truediv__.)
        """
        docres = Document(None)
        docres.char_vector = encode_array(decode_array(self.char_vector) / val)
        return docres
    def read(self):
        """
        Parse the file at self.filename: lowercase each line, split on
        runs of non-letters, drop empty tokens and ignored words,
        optionally stem, then accumulate counts into self.freq.
        """
        with open(self.filename, 'r') as f:
            for line in f:
                for w in split(r'[^a-z]+', line.strip().lower()):
                    word = unicode(w)
                    # Fixed: the original used "not word is u''", an
                    # identity test that is not guaranteed to detect an
                    # empty string; truthiness is the correct check.
                    if word and word not in self._ignore_words:
                        if self.stem is True:
                            word = stemWord(word)
                        self.freq[word] = self.freq.get(word, 0) + 1
    def words(self):
        """ Get the words parsed from the current document """
        return self.freq.keys()
    def words_frequence(self):
        """ Get an iterator of (word, frequency) pairs for all words
        parsed from the current document.  (Python 2 only: iteritems
        does not exist on Python 3 dicts.) """
        return self.freq.iteritems()
    @property
    def freq(self):
        """ Get a dictionary with frequencies indexed by the words """
        return self._freq
    @property
    def char_vector(self):
        """ Get the characteristic vector for the current document """
        return self._char_vector
    @char_vector.setter
    def char_vector(self, value):
        """
        Set the characteristic vector for the current document.

        NOTE(review): the original docstring claimed a TypeError is
        raised for non-numpy.array values, but no check is performed;
        the value is stored as-is.
        """
        self._char_vector = value
    @property
    def filename(self):
        """ Get the current document's pathname """
        return self._filename
class Parser(object):
    """
    A text file parser: turns a list of pathnames into Document objects
    while aggregating word frequencies across the whole collection.
    """
    def __init__(self, wignore_file):
        """
        :wignore_file Pathname of a file containing, on each line, a word
                      to be ignored when parsing.
        """
        self._docset = None
        self._ignored = None
        self._words = None
        with open(wignore_file, 'r') as f:
            self._ignored = set(unicode(w.strip()) for w in f)
    def __len__(self):
        """ Return the number of documents already parsed """
        return len(self._docset)
    def parse(self, doclist, stem=True, verbose=False):
        """
        Parse a list of documents, replacing any previous results.

        :doclist Iterable of document pathnames.
        :stem Passed through to Document: stem words when True.
        :verbose Print each pathname as it is processed.
        """
        # Plain reassignment drops the old references; the explicit
        # `del` statements in the original were redundant.
        self._docset = []
        self._words = {}
        for docname in doclist:
            if verbose is True:
                # Parenthesized so the statement is valid on both
                # Python 2 and Python 3.
                print('Parsing %s' % docname)
            doc = Document(docname, self._ignored, stem)
            doc.read()
            # Merge this document's counts into the overall tally.
            # Fixed: the original used map() with a side-effecting
            # lambda, which is unidiomatic and silently a no-op under
            # Python 3 where map() is lazy.
            freq = doc.freq
            for w in doc.words():
                self._words[w] = self._words.get(w, 0) + freq[w]
            self._docset.append(doc)
        # The original forced a GC cycle here, presumably to reclaim
        # temporaries after parsing large corpora; kept as-is.
        import gc
        gc.collect()
    @property
    def docset(self):
        """
        Get the list of parsed Documents.
        """
        return self._docset
    @property
    def words(self):
        """
        Get overall word frequencies
        """
        return self._words
    @words.setter
    def words(self, value):
        self._words = value