Skip to content
This repository was archived by the owner on Aug 26, 2022. It is now read-only.

Commit 4d15428

Browse files
committed
modified: plot.py
1 parent e51f667 commit 4d15428

20 files changed

+119
-34
lines changed

__pycache__/cluster.cpython-36.pyc

78 Bytes
Binary file not shown.
64 Bytes
Binary file not shown.

cluster.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ def locate_center(self, judge, maxid, threshold):
2323
# result showed in rank.png
2424
# 6 clusters should be divided in given dataset
2525

26-
cluster_centers = list(c[0] for c in result[0:5])
26+
cluster_centers = list(c[0] for c in result[0:3])
2727
# given dataset: [1061, 1515, 400, 6, 1566, 614]
28-
# generate dataset: [80, 460, 463, 500, 954, 984]
28+
# generate dataset: [642, 877, 123]
2929

3030
tag_info = dict()
3131
cluster_id = 1
@@ -47,8 +47,12 @@ def classify(self, taginfo, srt_dens, min_num, maxid):
4747
for ele in srt_dens:
4848
dens_dict[ele[0]] = ele[1]
4949
for i in dens_dict.keys():
50-
if taginfo[i] == -1:
51-
taginfo[i] = taginfo[min_num[i]]
50+
try:
51+
if taginfo[i] == -1:
52+
taginfo[i] = taginfo[min_num[i]]
53+
except KeyError:
54+
raise 'Key error: key does not exist!'
55+
5256
return taginfo
5357

5458
def analysis(self, centers, taginfo, distance, maxid):

data_process.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import math
21
import numpy as np
32
import matplotlib.pyplot as plt
43

@@ -33,39 +32,51 @@ def entropy(self, distance, maxid, factor):
3332
for i in range(1, maxid + 1):
3433
tmp = 0
3534
for j in range(1, maxid + 1):
36-
tmp += math.exp(-pow(distance[(i, j)] / factor, 2))
35+
tmp += np.exp(-pow(distance[(i, j)] / factor, 2))
3736
potential[i] = tmp
3837
z = sum(potential.values())
3938
H = 0
4039
for i in range(1, maxid + 1):
4140
x = potential[i] / z
42-
H += x * math.log(x)
41+
H += x * np.log(x)
4342
return -H
4443

4544
def threshold(self, dist, max_id):
4645
'''
4746
:rtype: factor value makes H smallest
4847
'''
4948
entro = 10.0
49+
# given data:
5050
# 0.02139999999999999 7.203581306901208
5151
# 0.02149999999999999 7.203577254067677
5252
# 0.02159999999999999 7.203577734107922
53-
scape = np.arange(0.021+1e-4, 0.022, 1e-4)
53+
54+
# generate data:
55+
# 0.367020, 6.943842
56+
# 0.368959, 6.943840
57+
# 0.370898, 6.943841
58+
59+
scape = np.linspace(0.330, 0.430, 50)
60+
# 通用数据使用以下一行
61+
# scape = np.linspace(0.001, 1.001, 100)
5462
for factor in scape:
5563
value = self.entropy(dist, max_id, factor)
56-
# print(factor, value)
64+
print('factor: {0:.6f}, entropy: {1:.8f}'.format(factor, value))
5765
# plt.scatter(factor, value, c='r', s=1)
5866
if value and value < entro:
5967
entro, thresh = value, factor
6068
thresh = 3 * thresh / pow(2, 0.5)
69+
6170
"""
6271
plt.xlabel(r'$\sigma$')
6372
plt.ylabel(r'H')
64-
plt.savefig('./images/Entropy.png')
73+
plt.savefig('./images/Entropy test.png')
6574
plt.close()
6675
"""
67-
# print('current: ', entro, thresh)
68-
# current: 7.203577254067677 0.04560838738653229
76+
77+
print('current: ', entro, thresh)
78+
# given data: 7.203577254067677 0.04560838738653229
79+
# generate data: 6.943840312796875 0.7828967189629044
6980
return thresh
7081

7182
def CutOff(self, distance, max_id, threshold):
@@ -90,7 +101,7 @@ def Guasse(self, distance, max_id, threshold):
90101
for i in range(1, max_id + 1):
91102
tmp = 0
92103
for j in range(1, max_id + 1):
93-
tmp += math.exp(-pow((distance[(i, j)] / threshold), 2))
104+
tmp += np.exp(-pow((distance[(i, j)] / threshold), 2))
94105
guasse[i] = tmp
95106
sorted_guasse = sorted(guasse.items(), key=lambda k:k[1], reverse=True)
96107
return sorted_guasse

generatePoints.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
r = np.random.RandomState(24)
88
p = r.randn(400, 2)
99
q = r.randn(400, 2) + 7
10-
r = r.randn(400, 2) + 4
10+
s = r.randn(400, 2) + 4
1111

12-
t = np.concatenate((p, q, r), axis=0)
12+
t = np.concatenate((p, q, s), axis=0)
1313

1414
with open(GENERATE_POINTS, 'w', encoding='utf-8') as f:
1515
for pos in range(len(t)):
@@ -18,7 +18,7 @@
1818

1919
d = lambda x, y: np.sqrt(np.power((x[0] - y[0]), 2) + np.power((x[1] - y[1]), 2))
2020

21-
with open(GENERATE_POINTS_DIST, 'a', encoding='utf-8') as f:
21+
with open(GENERATE_POINTS_DIST, 'w', encoding='utf-8') as f:
2222
for i in range(len(t)):
2323
for j in range(i + 1, len(t)):
2424
distance = d(t[i], t[j])
@@ -29,8 +29,8 @@
2929
plt.plot(x, y, 'or', markersize=1, alpha=0.5, label='1')
3030
# plt.show()
3131

32-
x = r[:, 0]
33-
y = r[:, 1]
32+
x = s[:, 0]
33+
y = s[:, 1]
3434
plt.plot(x, y, 'ob', markersize=1, alpha=0.5, label='2')
3535

3636
x = q[:, 0]

images/Cluster1 test.png

11.2 KB
Loading

images/Cluster2 test.png

15.6 KB
Loading

images/Cluster3 test.png

19.4 KB
Loading

images/Cluster4 test.png

-25 KB
Binary file not shown.

images/Cluster5 test.png

-47.4 KB
Binary file not shown.

images/Decision Graph Cutoff test.png

1.55 KB
Loading

images/Entropy test.png

13.2 KB
Loading

images/cluster_cutoff_test.png

-1.1 KB
Loading

images/generatedPoints.png

25.1 KB
Loading

images/rank cutoff test.png

3.15 KB
Loading

images/result.png

7.09 KB
Loading

others/report.html

Lines changed: 24 additions & 0 deletions
Large diffs are not rendered by default.

others/report.md

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,11 @@
1414

1515
- **Cut-off Kernel**
1616

17-
给定截断距离$d_{c} > 0$,采用Cut-off kernel方式计算局部密度,由$\rho_{i}=\Sigma_{j}\chi(d_{ij}-d_{c})$且$\chi(x) = 1\text{ if } x<0\text{ and }\chi(x)=0\text{ otherwise}$,这种方式计算局部密度$\rho_i$为连续值。
17+
给定截断距离$d_{c} > 0$,采用Cut-off kernel方式计算局部密度:$\rho_{i}=\large\Sigma_{j}\chi(d_{ij}-d_{c})$,其中$\chi(x) = 1\text{ if } x<0\text{ and }\chi(x)=0\text{ otherwise}$。这种方式计算出的局部密度$\rho_i$为离散值。
1818

1919
- **Gaussian kernel**
2020

21-
给定截断距离$d_{c} > 0$,采用Gaussian kernel方式计算局部密度,由$\rho_{i}=\Sigma_{j}e^-(\frac{d_{ij}}{d_{c}})^2$且$\chi(x) = 1\text{ if } x<0\text{ and }\chi(x)=0\text{ otherwise}$,这种方式计算局部密度$\rho_i$为离散值。
21+
给定截断距离$d_{c} > 0$,采用Gaussian kernel方式计算局部密度:$\rho_{i}=\large\Sigma_{j}e^{-(\frac{d_{ij}}{d_{c}})^2}$。这种方式计算出的局部密度$\rho_i$为连续值。
2222

2323
#### 2.1.2 最小距离$\delta_i$
2424

@@ -46,11 +46,11 @@
4646

4747
#### 2.2.1 Potential Of Point(POP)
4848

49-
对一个数据集$\{x_1,x_2,...,x_n\}$,每个点的potential计算公式为$\varphi(x)=\Sigma_{i=1}^n\big(e^-(\frac{||x-x_i||}{\sigma})^2\big)$,类似Gaussian kernel的计算,其中$||x-x_i||$代表欧式几何空间的$x$与$x_i$的距离,$\sigma$为需要确定的变量值。
49+
对一个数据集$\{x_1,x_2,...,x_n\}$,每个点的potential计算公式为$\varphi(x)=\large\Sigma_{i=1}^n\big(e^{-(\frac{||x-x_i||}{\sigma})^2}\big)$,类似Gaussian kernel的计算,其中$||x-x_i||$代表欧氏几何空间中$x$与$x_i$的距离,$\sigma$为需要确定的变量值。
5050

5151
#### 2.2.2 Entropy
5252

53-
对一个POP集$\{\varphi_1,\varphi_2,...,\varphi_n\}$,定义数据域的熵值$H=-\Sigma_{i=1}^n(\frac{\varphi_i}{Z})log(\frac{\varphi_i}{Z})$,熵值代表数据域的混乱度,我们需要求使得$H$最小的变量$\sigma$。 下图直观展示了$H$随$\sigma$的变化趋势:
53+
对一个POP集$\{\varphi_1,\varphi_2,...,\varphi_n\}$,定义数据域的熵值$H=-\large\Sigma_{i=1}^n(\frac{\varphi_i}{Z})\log(\frac{\varphi_i}{Z})$,其中$Z=\Sigma_{i=1}^n\varphi_i$为归一化因子。熵值代表数据域的混乱度,我们需要求使得$H$最小的变量$\sigma$。 下图直观展示了$H$随$\sigma$的变化趋势:
5454

5555
![entropy](../images/entropy.png)
5656

@@ -118,9 +118,53 @@ def classify(self, taginfo, srt_dens, min_num, maxid):
118118

119119
由之前的实验结果可知聚类中心共6个,这里对六个簇的分类情况进行了简单的可视化,横坐标为点标号,纵坐标为点到聚类中心的距离。由于点的个数较多,故采用面积图,如上图所示是第六个簇的效果图。
120120

121+
### 2.4 聚类测试
122+
123+
#### 2.4.1 测试数据
124+
125+
编写`generatePoints.py`来生成三个簇,每个簇400个点且均服从高斯分布,分布图如下所示。
126+
127+
![generatedPoints](../images/generatedPoints.png)
128+
129+
#### 2.4.2 聚类效果
130+
131+
通过求熵值来确定截断距离最佳取值的图如下:
132+
133+
![Entropy test](../images/Entropy test.png)
134+
135+
由此画出决策图如下:
136+
137+
![Decision Graph Cutoff test](../images/Decision Graph Cutoff test.png)
138+
139+
定义$\gamma_i=\rho_i\delta_i$为聚类中心的划分标准,画出图像如下:
140+
141+
![rank cutoff test](../images/rank cutoff test.png)
142+
143+
截断距离选择`0.7828`为最佳值,由图能直观看出此时应该划分三个类,和生成三个簇的数据基本相符。
144+
145+
对三个簇进行可视化,画出相应的结果如下图,黑色加粗点为聚类中心:
146+
147+
![result](../images/result.png)
148+
149+
#### 2.4.3 结果分析
150+
151+
聚类结果与生成图对比发现有的边缘点被忽略了,生成每个聚类簇元素视图如下:
152+
153+
![cluster_cutoff_test](../images/cluster_cutoff_test.png)
154+
155+
1. 可见有一部分点被分到标号为-1的簇中,这些点是在非聚类中心点的分类过程中距离三个聚类中心都很远的离群点,因此在可视化过程中由聚类中心生成对应的簇时,这些点会被忽略,从而导致聚类结果图中点的缺失。
156+
157+
**对这些离群点进行有效的信息处理和聚类划分,可以是对该算法优化的下一步工作。**
158+
159+
2. 对于一些交错点的划分,可见该算法较为朴素,在处理维度过高或者密度过大的点时容易出现交错点的错误划分。
160+
161+
**对交错点进行有效的处理可以有效解决这个问题,同时可以提升该算法的健壮性。**
162+
121163
## 3. 总结
122164

123-
由于对距离定义未知,所以没有进行六类cluster的plot。文章中提到的聚类算法其实只实现了聚类中心的选择,在这基础上阅读了文章的增补内容,进行了聚类过程算法的补全,同时对截断距离的选取进行优化。在这基础之上还可以对聚类边界进行讨论,对离群点和交叉点进行划分。
165+
由于对距离定义未知,所以没有对初始数据进行六类cluster的plot,只在测试数据集上进行了相关的聚类可视化处理。文章中提到的聚类算法其实只实现了聚类中心的选择,在这基础上阅读了文章的增补内容,进行了聚类过程算法的补全,同时对截断距离的选取进行优化。
166+
167+
在这基础之上还可以对聚类边界进行讨论,对离群点和交叉点进行划分。
124168

125169
对聚类算法的聚类中心选择一直是个研究热点,该算法很朴素但切中要点,能很好地解决聚类中心问题,但是在聚类中心个数的选择上和k-means算法一样,还是需要人为选择,联系对局部密度算法的优化,猜测是否可以对每个点进行熵值计算,寻找聚类中心熵值的特性,从而实现聚类中心个数的自动选择。
126170

plot.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
def main():
1010
solution = data_process.ProcessData()
1111
dist, maxid = solution.data_process(TEST_DATA)
12-
threshold = solution.threshold(dist, maxid)
12+
# 通用数据使用以下一行求截断距离(耗时较长)
13+
# threshold = solution.threshold(dist, maxid)
14+
threshold = 0.7828967189629044
1315
sort_dst = solution.CutOff(dist, maxid, threshold)
1416
# sort_dst = solution.Guasse(dist, maxid, threshold)
1517
min_dist, min_num = solution.min_distance(dist, sort_dst, maxid)
@@ -37,35 +39,35 @@ def main():
3739
p, x, y = int(p), float(x), float(y)
3840
coords[p] = [x, y]
3941
# print(coords[center[0]])
40-
for i in range(len(center) - 1):
42+
for i in range(len(center)):
4143
c = coords[center[i]]
4244
plt.plot(c[0], c[1], 'ok', markersize=5, alpha=0.8)
4345

44-
color = {0:'k', 1:'b', 2:'g', 3:'r', 4:'c', 5:'m', 6:'y'}
46+
color = {0:'r', 1:'b', 2:'g', 3:'k', 4:'c', 5:'m', 6:'y'}
4547
for p in temp:
4648
for i in range(len(center)):
49+
c = coords[p[0]]
4750
try:
48-
c = coords[p[0]]
49-
if p[1] == i:
51+
# 标号从1开始,故i+1
52+
if p[1] == i + 1:
5053
plt.scatter(c[0], c[1], c=color[i], alpha=0.6, s=1)
5154
except KeyError:
52-
continue
53-
# plt.scatter(c[0], c[1], c=color[i], alpha=0.6, s=1)
55+
raise 'Key map not exis!'
56+
5457
plt.xlabel('x')
5558
plt.ylabel('y')
5659
plt.title('Plot Result')
5760
plt.savefig('./images/result.png')
58-
plt.show()
61+
# plt.show()
62+
plt.close()
5963

60-
"""
6164
y, x = zip(*temp)
6265
plt.scatter(x, y)
6366
plt.xlabel('Cluster Number')
6467
plt.ylabel('Point Number')
6568
plt.title(r'$d_c=$' + str(threshold))
6669
plt.savefig('./images/cluster_cutoff_test.png')
67-
plt.show()
68-
"""
70+
# plt.show()
6971

7072
if __name__ == '__main__':
7173
main()

report.pdf

179 KB
Binary file not shown.

0 commit comments

Comments
 (0)