-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathFLAME.py
171 lines (156 loc) · 7.07 KB
/
FLAME.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""
FLAME - Fuzzy clustering by Local Approximation of MEmbership
"""
from __future__ import print_function
import numpy as np
from scipy import sparse
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_array
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import normalize
from math import sqrt
class FLAME(BaseEstimator, ClusterMixin):
def __init__(self, metric="euclidean", cluster_neighbors=5, iteration_neighbors=5, max_iter=np.inf, eps=1e-10, thd=-2, verbose=0):
self.metric = metric
self.cluster_neighbors = cluster_neighbors
self.iteration_neighbors = iteration_neighbors
self.max_iter = max_iter
self.eps = eps
self.thd = thd
self.verbose = verbose
def _get_nearest(self, distances, n_neighbors, n_samples):
# Make a numpy arange for iteration purposes.
sample_range = np.arange(n_samples)[:, None]
# Do an introsort on each row of the distances matrix to put the nth smallest distance in the nth position and all
# smaller elements before it. Then keep only the first n+1 elements (including the element itself which will have
# distance 0 from itself and is removed later).
nearest_np = np.argpartition(distances, n_neighbors, axis=1)
nearest_np = nearest_np[:, :n_neighbors + 1]
# Find the largest distance of the kth closest points.
largest_distance = distances[sample_range, nearest_np[sample_range, -1]]
# Make two arrays of sets the first containing only the n nearest other elements to each element not
# including the element itself and the second containing the same plus any other elements tied for nth nearest
# again excluding the element itself (though if there are k other elements all 0 distance away other problems
# will result).
nearest = []
nearest_with_ties = []
for i in range(n_samples):
ties_for_largest_distance = np.where(distances[i] == largest_distance[i])
nearest.append(set(nearest_np[i, :].tolist()))
nearest[-1].remove(i)
ties_for_largest_distance = set(ties_for_largest_distance[0].tolist())
ties_for_largest_distance.discard(i)
nearest_with_ties.append(nearest[i] | ties_for_largest_distance)
return nearest, nearest_with_ties
def _get_densities(self, distances, nearest, n_samples):
# Make a numpy arange for iteration purposes.
sample_range = np.arange(n_samples)[:, None]
nearest_np = np.array([list(s) for s in nearest])
n_shortest_distances = distances[sample_range, nearest_np]
local_distance_sums = n_shortest_distances.sum(axis=1)
largest_local_sum = local_distance_sums.max(axis=0)
densities = np.asarray(largest_local_sum / local_distance_sums)
return densities
def _get_supports(self, densities, nearest_with_ties, n_samples):
density_sum = densities.sum()
density_mean = density_sum / n_samples
density_sum2 = (densities * densities).sum()
thd = density_mean + self.thd * sqrt(density_sum2 / n_samples - density_mean * density_mean)
csos = []
outliers = []
remaining = []
for i in range(n_samples):
if densities[i] < thd:
outliers.append(i)
elif densities[i] > densities[list(nearest_with_ties[i])].max():
csos.append(i)
else:
remaining.append(i)
return csos, outliers, remaining
def _get_weights(self, distances, nearest_with_ties, fixed, n_samples):
nearest_with_ties = [sorted(list(s)) for s in nearest_with_ties]
weights = lil_matrix((n_samples, n_samples))
for i in range(n_samples):
if i in fixed:
weights[i, i] = 1
else:
for j in nearest_with_ties[i]:
weights[i, j] = distances[i, j]
if self.verbose: print("Assigned weights {0}.".format(i))
weights = weights.tocsr()
weights = normalize(weights, norm='l1', axis=1, copy=False)
return weights
def _get_starting_membership(self, csos, outliers, fixed, n_samples):
M = len(csos) + 1
starting_membership = np.zeros(shape=(n_samples, M))
general_row = np.ndarray(shape=(1, M))
general_row.fill(1. / M)
for i in range(n_samples):
if i not in fixed:
starting_membership[i, :] = general_row
for index, value in enumerate(csos):
starting_membership[value, index] = 1
for i in outliers:
starting_membership[i, -1] = 1
return starting_membership
def _flame(self, X):
"""
Pass Numpy or Pandas array of data as X. As metric pass any string as in sklearn.metrics.pairwise.pairwise_distances
or a callable on pairs of members of X. FLAME is computed with n_neighbors until max_iter or convergence up to eps.
thd is the threshold for outliers: Any element which has less than mean(density) + thd * std(density) will be an outlier.
"""
if sparse.issparse(X) and self.metric not in {"precomputed", "cityblock", "cosine", "euclidean", "l1", "l2",
"manhattan"} and not callable(self.metric):
raise TypeError("The metric {0} does not support sparse data.".format(self.metric))
# Convert pandas objects to numpy arrays.
if 'pandas' in str(X.__class__):
X = X.values
X = check_array(X, accept_sparse="csr", dtype=None)
# Get the number of samples. We use this a lot.
n_samples, _ = X.shape
distances = pairwise_distances(X, metric=self.metric)
nearest, nearest_with_ties = self._get_nearest(distances, self.cluster_neighbors, n_samples)
if self.verbose: print("Got distances and nearest.")
densities = self._get_densities(distances, nearest, n_samples)
if self.verbose: print("Got densities.")
csos, outliers, remaining = self._get_supports(densities, nearest_with_ties, n_samples)
if self.verbose: print("Got suppports.")
if self.verbose: print("There are {0} clusters and {1} outliers.".format(len(csos), len(outliers)))
fixed = set(csos) | set(outliers)
_, nearest_with_ties_for_iteration = self._get_nearest(distances, self.iteration_neighbors, n_samples)
weights = self._get_weights(distances, nearest_with_ties_for_iteration, fixed, n_samples)
if self.verbose: print("Got weights.")
membership_proba = self._get_starting_membership(csos, outliers, fixed, n_samples)
if self.verbose: print("Got starting memberships.")
i = 0
while i < self.max_iter:
lastMembership = membership_proba.copy()
membership_proba = weights.dot(membership_proba)
delta = np.absolute(membership_proba - lastMembership).max()
i += 1
if self.verbose: print("Done iteration {0}.".format(i))
if delta < self.eps:
break
num_clusters = membership_proba.shape[1] - 1
# Get cluster assignment.
pred = np.argmax(membership_proba, axis=1)
# Replace predictions of the outlier group with -1.
pred[pred == num_clusters] = -1
return membership_proba, pred, csos, outliers, densities
def fit(self, X):
self.membership_proba_, self.labels_, self.csos_, self.outliers_, self.densities_ = \
self._flame(X)
return self
def fit_predict(self, X, y=None):
y = self.fit(X).labels_
return y
def fit_predict_proba(self, X, y=None):
y = self.fit(X).membership_proba_
return y
if __name__== "__main__":
X = np.array(
[[0, 0, 0], [1.1, 0, 0], [0, 0.8, 0], [0, 0, 1.3], [10, 10, 10], [11.1, 10, 10], [10, 10.8, 10], [10, 11, 12]])
model = FLAME(cluster_neighbors=3, iteration_neighbors=3)
membership = model.fit_predict(X)
print(membership)