-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans.py
150 lines (126 loc) · 4.64 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import numpy as np
from scipy.spatial.distance import euclidean as ec_dist
import math
# from timeit import default_timer as timer
"""kmeans.py: implements kmeans class"""
__author__ = "Nikhil Tumkur Ramesh"
__copyright__ = "Copyright 2018, IIT Mandi"
def mag2(X):
""" mag2: Returns the sum of squares of a vector.
To be read as "Magnitude Square"
"""
return np.sum(X**2, dtype=np.float64)
class kmeans():
'''
Attributes:
n_clusters: stores the number of clusters to be made.
default value: 2
max_iter: stores the upper limit for EM iteration.
default value: 300
means: stores the mean vectors.
clusters: stores the current allocation of points.
old_clusters: stores the previous allocation of points. It is used in the
stopping criteria.
'''
def __init__(self, n_clusters=2, max_iter=300):
self.n_clusters = n_clusters
self.max_iter = max_iter
def initialize_clusters(self, X):
self.clusters=np.array([-1]*X.shape[0])
self.old_clusters=np.array([-2]*X.shape[0])
def initialize_means(self, X, n_clusters, debug_mode=False):
self.means=X[0:n_clusters,:].astype(dtype=np.float64)
self.initialize_clusters(X)
# Perform check to ensure the cluster means are different. Otherwise you
# will run into all kinds of errors later.
last_index=n_clusters-1
all_different=False
while not all_different:
# if debug_mode:
# print("Initializing because not all points maybe different")
all_different=True
for idx, mean in enumerate(self.means):
for i in range(idx):
if np.array_equal(self.means[i], mean):
self.means[idx]=X[last_index+1,:].astype(dtype=np.float64)
last_index+=1
all_different=False
if debug_mode:
print("added:", self.means[idx])
break
self.old_means=np.zeros_like(self.means, dtype=np.float64)
# print('means are:')
# print(self.means)
# if debug_mode:
# print("Now verified that all means are initialized to different values")
''' Method: fit(input, number_of_clusters)
The main method used for performing the kmeans algorithm. it is first
initialized and then the EM loop is run untill convergence criteria is
reached.
'''
def fit(self, X, n_clusters, display_progress=False, debug_mode=False):
# start = timer()
self.n_clusters = n_clusters
self.initialize_means(X, n_clusters, debug_mode)
counter=0
dist=0.0
for idx in range(self.means.shape[0]):
dist+=ec_dist(self.means[idx], self.old_means[idx])
print("K=", self.n_clusters, ":Iteration #", counter, ":", dist, "distance shifted")
while (dist>1e-3) and counter < self.max_iter:
self.assign_clusters(X)
self.update_means(X)
counter += 1
if display_progress:
dist=0.0
for idx in range(self.means.shape[0]):
dist+=ec_dist(self.means[idx], self.old_means[idx])
print("K=", self.n_clusters, ":Iteration #", counter, ":", dist, "distance shifted")
self.calculate_cov(X)
''' Method: assign_clusters(input)
A helper method for allocating the points to clusters. Other required
data being used are the attributes means.
'''
def assign_clusters(self, X):
self.transfer_clusters()
for i in range(X.shape[0]):
self.distances=np.array([], dtype=np.float64)
for mean in self.means:
self.distances=np.append(self.distances,math.sqrt(mag2(X[i]-mean)))
self.clusters[i] = self.distances.argmin()
def update_means(self, X):
self.old_means = np.copy(self.means)
for i in range(self.means.shape[0]):
points=[]
for idx, point in enumerate(X):
if self.clusters[idx]==i:
points.append(point)
if len(points) == 0:
print("encountered zero list of points for i=", i)
input()
self.means[i]=np.mean(points,axis=0, dtype=np.float64)
def transfer_clusters(self):
self.old_clusters = np.copy(self.clusters)
def cluster_means(self):
return self.means
def gen_list_of_points(self):
self.list_of_points=[[] for i in range(self.n_clusters)]
for idx, i in enumerate(self.clusters):
self.list_of_points[i].append(idx)
def calculate_cov(self, X):
self.covariances=np.ones_like(self.means, dtype=np.float64)
self.list_of_points=[[] for i in range(self.n_clusters)]
for idx, i in enumerate(self.clusters):
self.list_of_points[i].append(idx)
for idx, l in enumerate(self.list_of_points):
if len(l)<2:
print("the ", idx, "cluster has ", len(l), " point(s). skip? Y/N")
c=input(">>")
if(not c=="N"):
print("cov retained as ones.")
continue
self.covariances[idx]=np.var(X[l,:], axis=0)
'''
saved code:
not np.array_equal(self.clusters, self.old_clusters)
'''