forked from r4f4/mc906
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutil.py
132 lines (102 loc) · 3.48 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import zlib
import cPickle
from math import sqrt
from cStringIO import StringIO

import numpy
from numpy import dot, array

from Stemmer import Stemmer
# Keep just one shared English stemmer instance for the whole module
# (reused by stemWord below), instead of constructing one per call.
stemmer = Stemmer('english')
class memoize:
    """Decorator that caches a function's results keyed by its positional
    arguments.

    The cache grows without bound and arguments must be hashable.
    """
    def __init__(self, function):
        self.function = function
        self.memoized = {}

    def __call__(self, *args):
        cache = self.memoized
        if args not in cache:
            cache[args] = self.function(*args)
        return cache[args]
class memoize2:
    """Decorator caching results of a symmetric two-argument function.

    Each computed result is stored under both the given argument order and
    its reverse, so f(a, b) also answers a later call to f(b, a).
    """
    def __init__(self, function):
        self.function = function
        self.memoized = {}

    def __call__(self, *args):
        if args not in self.memoized:
            result = self.function(*args)
            self.memoized[args] = result
            self.memoized[args[::-1]] = result
        return self.memoized[args]
@memoize
def stemWord(word):
    """Return the English stem of *word*, caching results via memoize."""
    stemmed = stemmer.stemWord(word)
    return stemmed
def norm(vector):
    """Return the Euclidean (L2) norm of *vector*.

    Computed as sqrt(v . conj(v)); for real vectors the conjugate is a
    no-op.
    """
    squared = dot(vector, vector.conj())
    return sqrt(squared)
def encode_array(a, factor=3):
    """
    Serialize and compress a numpy array so we can use less memory.

    :a      numpy array to encode (dtype must match what decode_array expects)
    :factor zlib compression level, 0-9; the default 3 trades ratio for speed
    :return compressed byte string; invert with decode_array
    """
    # tostring() was a deprecated alias of tobytes() and has been removed
    # from modern numpy; tobytes() produces the identical byte string.
    compressed_str = zlib.compress(a.tobytes(), factor)
    return compressed_str
def decode_array(compressed_str):
    """
    Decompress and deserialize an array produced by encode_array.

    :compressed_str byte string returned by encode_array
    :return         float64 numpy array (the default dtype, matching the
                    encoder's default)
    """
    uncompressed_str = zlib.decompress(compressed_str)
    # Bug fix: the original called numpy.fromstring, but this module never
    # imports the `numpy` name itself (only `dot` and `array`), so every
    # call raised NameError. frombuffer is the supported replacement for
    # the removed fromstring; .copy() keeps the result writable, matching
    # fromstring's old semantics.
    return numpy.frombuffer(uncompressed_str).copy()
@memoize2
def distance(doc1, doc2):
    """ Calculate the distance between doc1 and doc2. Assume the documents are
    encoded (by encode_document).
    The distance is calculated as a cosine measure:
    dist = cos(d1, d2) = (d1 . d2) / ||d1|| ||d2||
    """
    # Validate before doing any decoding work.
    assert isinstance(doc1, type(doc2)), \
        "objects type mismatch: %s and %s." % (type(doc1), type(doc2))
    char_vec1 = decode_array(doc1.char_vector)
    char_vec2 = decode_array(doc2.char_vector)
    # Bug fix: the return expression referenced undefined names
    # char_vector1/char_vector2, so every call raised NameError; the
    # decoded locals are char_vec1/char_vec2.
    return 1 - dot(char_vec1, char_vec2) / \
        (norm(char_vec1) * norm(char_vec2))
def calc_centroid(cluster):
    """
    Return the centroid (mean) of the elements of the given cluster.

    Note: the elements must support addition with each other and division
    by an int. One element is temporarily removed and re-inserted, so the
    cluster is left unchanged on return.
    """
    assert len(cluster) > 0
    # Seed the accumulator with one element so we never need a zero value.
    seed = cluster.pop()
    total = seed
    for element in cluster:
        total = total + element
    cluster.add(seed)
    return total / len(cluster)
def normalize(doc, words, idf=True):
    """
    Normalize the characteristic vector of Document doc.

    :doc   document exposing a `freq` term-frequency dict; its
           `char_vector` attribute is set to the encoded, L2-normalized
           vector
    :words mapping of term -> document frequency over the collection
    :idf   Whether use or not the inverse document frequence
    Note:
        be aware that parsing additional documents after calling this
        method will result in different characterist vectors from before.
        So parse all documents needed first.
    """
    freq = doc.freq
    # One vector component per term in `words`; the dict's iteration order
    # fixes the component order, so `words` must be the same for all docs.
    if idf:
        # Each term is weightened by the inverse document frequency in the
        # document collection. List comprehensions replace array(map(...)):
        # under Python 3 map returns an iterator, which array() mishandles.
        darray = array([float(freq.get(w, 0)) / words[w] for w in words])
    else:
        darray = array([float(freq.get(w, 0)) for w in words])
    # Normalize the vector to unit length.
    darray /= norm(darray)
    doc.char_vector = encode_array(darray)
def get_clusters(centroids, data):
    """
    Partition *data* into one set per centroid.

    :centroids An ordered sequence of centroids.
    :data      A set (not necessarily ordered) of data items.
    :return    list of sets; the set at index j contains the items whose
               nearest centroid (by `distance`) is centroids[j], ties
               going to the lowest index.
    """
    clusters = [set() for _ in xrange(len(centroids))]
    for item in data:
        # min() keeps the first (lowest-index) centroid on equal distances,
        # matching a tuple-based (distance, index) minimum.
        nearest = min(xrange(len(centroids)),
                      key=lambda j: distance(centroids[j], item))
        clusters[nearest].add(item)
    return clusters