forked from r4f4/mc906
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkmeans.py
100 lines (80 loc) · 3.14 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import random
def choose_initial(data, k, distfunc=None):
"""
Choose randomly k different centroids.
:data The elements being clustered
:k The number of clusters
:distfunc Ignored
"""
return random.sample(data, k)
def choose_initial_pp(data, k, distfunc):
"""
Choose randomly k different centroids using the kmeans++ heuristic
by David Arthur and Sergei Vassilvitskii (see the article "k-means++:
The Advantages of Careful Seeding".
:data The elements being clustered
:k The number of clusters
:distfunc Function to calculate the distance between two elements.
"""
from bisect import bisect
from numpy import add
# Calculate squared distance
distance2 = lambda c, x: distfunc(c, x)**2
# The first centroid is a random one
centroids = [random.choice(data)]
tries = 0
while len(centroids) < k and tries < (k * 5):
mindists = [min((distance2(c, x), x) for c in centroids)
for x in data if x not in centroids]
# Divide because we add it twice: first for (c, x) and then for (x, c)
totaldist = float(sum(d for d, _ in mindists))
probs = [(d / totaldist) for d, _ in mindists]
pos = bisect(add.accumulate(probs), random.random())
centroids.append(mindists[pos - 1][1])
tries += 1
if len(centroids) < k:
centroids.extend(random.sample(data, k - len(centroids)))
return centroids
class Kmeans(object):
"""
k-means algorithm
"""
def __init__(self, data, k, distfunc=None, centroidfunc=None,
chooser=choose_initial, tol=0.0005):
"""
The K-means algorithm
:data A data set
:k The number of clusters
:distfunc A function to calculate the distance between two elements
from data.
:centroidfunc A function to calculate the centroid of the cluster. It
will receive a cluster.
:chooser A function to choose the initial k centroids. It must receive
the data set, the number of clusters k and the distfunc.
:tol Error tolerance
"""
assert distfunc is not None, "distfunc can't be None"
assert centroidfunc is not None, "centroidfunc can't be None"
assert data is not None, "data can't be None"
assert k > 0, "number of clusters can't be 0 or negative"
self.centroids = chooser(data, k, distfunc)
err = -1
niter = 1
while True:
bins = [set() for _ in xrange(k)]
#print 'Iteration #%d' % niter
niter += 1
for i in data:
ml = [(distfunc(c, i), ic) for ic, c in enumerate(self.centroids)]
ml_min, _ = min(ml)
c = random.choice([j for m, j in ml if m == ml_min])
bins[c].add(i)
for bi, b in enumerate(bins):
self.centroids[bi] = centroidfunc(b)
olderr = err
err = sum([sum([distfunc(d, self.centroids[c]) for d in b])
for c, b in enumerate(bins)])
if err - olderr < tol:
break
def result(self):
return self.centroids