-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgraph_cluster.py
193 lines (172 loc) · 5.37 KB
/
graph_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import networkx as nx
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import normalize
# from scipy.linalg import eigh
from scipy.sparse.linalg import eigsh
def get_overlap_clusters(G, k, I, cliques=None):
clusters = {}
id_c = 0;
for cluster in _CPM_cluster(G, k, I, cliques):
clusters[id_c] = list(cluster)
id_c += 1
return clusters
"""
Clusters G using the Clique Percolation Method
Parameters
----------
G : NetworkX graph
Input graph
k : int
Size of smallest clique
I : double
Intensity threshold for weighted graphs
Set I=0 for unweighted graphs
cliques: list or generator
Precomputed cliques (use networkx.find_cliques(G))
Returns
-------
Yields dictionnary of clusters.
"""
def _CPM_cluster(G, k, I, cliques=None):
"""
Clusters G using the Clique Percolation Method
Parameters
----------
G : NetworkX graph
Input graph
k : int
Size of smallest clique
I : double
Intensity threshold for weighted graphs
Set I=0 for unweighted graphs
cliques: list or generator
Precomputed cliques (use networkx.find_cliques(G))
Returns
-------
Yields sets of nodes, one for each k-clique community.
"""
if k < 2:
raise nx.NetworkXError("k=%d, k must be greater than 1."%k)
if cliques is None:
cliques = list(nx.find_cliques(G))
cliques = [frozenset(c) for c in cliques if (len(c) >= k and _intensity(c,G) > I)]
# First index which nodes are in which cliques
membership_dict = defaultdict(list)
for clique in cliques:
for node in clique:
membership_dict[node].append(clique)
# For each clique, see which adjacent cliques percolate
perc_graph = nx.Graph()
perc_graph.add_nodes_from(cliques)
for clique in cliques:
for adj_clique in _get_adjacent_cliques(clique, membership_dict):
if len(clique.intersection(adj_clique)) >= (k - 1):
perc_graph.add_edge(clique, adj_clique)
# Connected components of clique graph with perc edges
# are the percolated cliques
for component in nx.connected_components(perc_graph):
yield(frozenset.union(*component))
def _intensity(clique, G):
"""
For unweighted graphs, returns 1
"""
product = 1.0
k = 0
for edge in nx.subgraph(G,clique).edges():
product *= G.get_edge_data(edge[0],edge[1])['weight']
k += 1
if product == 0:
return 1
else:
return product**(1.0/k)
def _get_adjacent_cliques(clique, membership_dict):
adjacent_cliques = set()
for n in clique:
for adj_clique in membership_dict[n]:
if clique != adj_clique:
adjacent_cliques.add(adj_clique)
return adjacent_cliques
def pagerank_top_k(G, k = 10):
pr = nx.pagerank_scipy(G)
ky, v = pr.keys(), pr.values()
ix = np.argsort(v)[::-1]
return np.array(ky)[ix[:k]]
def SpectralEmbedding(G, k = 5):
'''
Takes a input Graph, and embeds it into k-dimensional
euclidean space using a top-k-eigenvector embedding.
'''
# creates row normalized weight matrix
M = normalize(nx.adj_matrix(G.copy()), norm='l1', axis=1)
_, v = eigsh(M, k = k, which = 'LM')
return v
def MCL_cluster(G,ex,r,tol,threshold):
"""
Computes a clustering of graph G using the MCL algorithm
with power parameter ex and inflation parameter r
The algorithm runs until the relative decrease in norm
is lower than tol or after 10,000 iterations
Returns an array whose values are greater than threshold
Leaves the graph G unchanged
"""
M = np.array(nx.adj_matrix(G.copy()))
M = inflate(M,1)
norm_old = 0
norm_new = np.linalg.norm(M)
it = -1
itermax = 10000
while it < itermax:
it += 1
norm_old = norm_new
M = M**ex
M = inflate(M,r)
norm_new = np.linalg.norm(M)
if __name__ == '__main__':
# debugging
print "iteration %s" %it
print "prop. decrease %s" %(abs(norm_old-norm_new)/norm_old)
if abs(norm_old-norm_new)/norm_old < tol:
print it
break
M[M < threshold] = 0
return M
def inflate(M,r):
"""
Returns a copy of the numpy array M inflated columnwise by a factor r
Rmk: r = 1 makes M stochastic
"""
col_sums = np.power(M,r).sum(axis=0)
mat = np.power(M,r) / col_sums[np.newaxis,:]
return mat
def add_self_loops(G):
"""
Adds self loops of weight 1 to the weighted graph G (in place)
for nodes that don't have one already
"""
for node in G.nodes():
if not G.has_edge(node, node):
G.add_edge(node, node, weight=1.)
if __name__ == '__main__':
"""
debugging code
"""
G = nx.Graph()
L = ['a','b','c','d']
G.add_nodes_from(L)
G.add_weighted_edges_from([('a','b',1),('c','b',2),('c','a',1),('d','a',1)])
# G = nx.fast_gnp_random_graph(40,.5)
part = get_overlap_clusters(G,2,0)
# part = _CPM_cluster(G,2,0)
# for p in part:
# print p
# mat = np.array(nx.adj_matrix(G))
# G = nx.fast_gnp_random_graph(40,.5)
# mat[0,0] = 10
# tol = 1e-5
# threshold = 1e-5
# ex = 2
# r = 2
# print mat
# m = MCL_cluster(G,ex,r,tol,threshold)
# print m