-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathauto_nmf.py
197 lines (159 loc) · 8.14 KB
/
auto_nmf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Haoyu Chao ([email protected])
'''
===================
一、脚本使用方法
===================
1. 先使用pancancer账户切换到gpuserver节点:
ssh gpuserver
2. 再使用绝对路径调用conda激活环境:
source /public/workspace202011/pancancer/chaohy/software/miniconda3/bin/activate gpu
3. 调用脚本处理demo数据
python /public/workspace202011/pancancer/chaohy/scripts/auto_nmf.py \
-i /public/workspace202011/pancancer/chaohy/NMF_count/demo/demo.count.txt \
-W /public/workspace202011/pancancer/chaohy/NMF_count/demo/demo.w.txt \
-H /public/workspace202011/pancancer/chaohy/NMF_count/demo/demo.h.txt \
--rank_min 1 --rank_max 30 -n 3000
4. 查看脚本帮助文档
python /public/workspace202011/pancancer/chaohy/scripts/auto_nmf.py -h
===================
二、其他有帮助的命令
===================
1. 查看GPU使用情况, 每秒刷新一次. ctrl+c退出
watch -n 1 nvidia-smi
2. 想回到node5节点, 需要先切换到server, 再切到node5。:)
ssh server
ssh node5
'''
print('Loading modules...')
import argparse
import torch
import pandas as pd
import matplotlib.pyplot as plt
# 定义非负矩阵分解函数
def nmf(data, rank, num_iterations, tol=1e-5, seed=520):
torch.manual_seed(seed) # set random seed
# Convert data to tensor
data_tensor = torch.tensor(data.values, dtype=torch.float32).cuda() # fp32
# Random initialization of factor matrices
m, n = data_tensor.shape
W = torch.abs(torch.randn((m, rank), device='cuda'))
H = torch.abs(torch.randn((rank, n), device='cuda'))
# print('W shape: {}, H shape: {}'.format(W.shape, H.shape))
# Iterative update of factor matrices
prev_recon_error = float('inf') # Set previous reconstruction error to infinity
for i in range(num_iterations):
# Update H
numerator = torch.matmul(W.t(), data_tensor)
denominator = torch.matmul(torch.matmul(W.t(), W), H)
denominator = torch.where(denominator < 1.3e-7, torch.tensor(1.3e-7, device='cuda'), denominator) # Protect the denominator
H *= numerator / denominator
# Update W
numerator = torch.matmul(data_tensor, H.t())
denominator = torch.matmul(torch.matmul(W, H), H.t())
denominator = torch.where(denominator < 1.3e-7, torch.tensor(1.3e-7, device='cuda'), denominator) # Protect the denominator
W *= numerator / denominator
# Calculate reconstruction error and check for convergence
if i % 10 == 0:
recon_error = torch.norm(data_tensor - torch.matmul(W, H), p='fro')
rel_change = torch.abs(recon_error - prev_recon_error) / prev_recon_error
prev_recon_error = recon_error
# print('iterations: {}, Reconstruction error: {:.2f}, Relative change: {:.2e}'.format(i, recon_error, rel_change))
if rel_change <= tol:
print('Converged after {} iterations, Recon_error: {:.2f}, Tol: {:.2e}'.format(i, recon_error, rel_change))
break
if i == num_iterations - 1:
print('Reached maximum number of iterations, Recon_error: {:.2f}, Tol: {:.2e}'.format(recon_error, rel_change))
# Convert results to pandas DataFrame and add row and column names
W_df = pd.DataFrame(W.cpu().numpy(), index=data.index)
H_df = pd.DataFrame(H.cpu().numpy(), columns=data.columns)
return W_df, H_df, recon_error
def find_optimal_rank(data, rank_min, rank_max, num_iterations):
best_rank = None
similarity_values = []
recon_error_values = []
score_values = []
rank_values = range(rank_min, rank_max + 1)
for rank in range(rank_min, rank_max + 1):
print('Evaluating rank: {}'.format(rank))
W, H, recon_error = nmf(data, rank, num_iterations)
recon_error_values.append(recon_error.item())
# Convert data to tensor
data_tensor = torch.tensor(data.values, dtype=torch.float32).cuda() # fp32
W = torch.tensor(W.values, dtype=torch.float32).cuda() # fp32
H = torch.tensor(H.values, dtype=torch.float32).cuda() # fp32
# Calculate similarity between data and reconstruction
similarity = torch.cosine_similarity(data_tensor, torch.matmul(W, H), dim=1) # 按细胞所在的行计算余弦相似度
similarity_avg = torch.mean(similarity) # 使用平均值作为衡量相似性的指标
similarity_values.append(similarity_avg.item())
print('Similarity: {}'.format(similarity_avg))
# 将重构误差归一化到0-1之间
scaled_recon_error_values = scale_values(recon_error_values)
# Calculate the combined score as weighted average
alpha = 0.3 # 重构误差的权重
beta = 0.7 # 相似性的权重
score_values = [alpha * (1 - x) + beta * y for x, y in zip(scaled_recon_error_values, similarity_values)]
for i in range(len(score_values)):
if i > 0:
if abs(score_values[i] - score_values[i-1]) < 0.001:
best_rank = i + rank_min
best_similarity = similarity_values[i]
break
if best_rank is None:
best_rank = score_values.index(max(score_values))
best_similarity = similarity_values[best_rank]
# 保存结果图像为PDF
plot_results(rank_values, similarity_values, scaled_recon_error_values, score_values, 'result_plot.pdf', best_rank)
return best_rank, best_similarity
def plot_results(rank_values, similarity_values, scaled_recon_error_values, score_values, output_file, best_rank):
fig, ax1 = plt.subplots(figsize=(len(rank_values) * 0.4, 4))
color = 'tab:green'
ax1.set_xlabel('Rank')
ax1.set_ylabel('Similarity', color=color)
ax1.plot(rank_values, similarity_values, color=color, marker='o')
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis
color = 'tab:blue'
ax2.set_ylabel('Error', color=color)
ax2.plot(rank_values, scaled_recon_error_values, color=color, marker='o')
ax2.tick_params(axis='y', labelcolor=color)
color='tab:red'
ax3 = ax1.twinx() # instantiate a third axes that shares the same x-axis
ax3.spines['right'].set_position(('outward', 60)) # move the third axis to the right
ax3.set_ylabel('Score', color=color)
ax3.plot(rank_values, score_values, color=color, marker='o')
ax3.tick_params(axis='y', labelcolor=color) # Set label color to red
plt.xticks(range(min(rank_values), max(rank_values) + 1), range(min(rank_values), max(rank_values) + 1))
plt.axvline(x=best_rank, color='purple', linestyle='--')
fig.tight_layout() # otherwise the right y-label is slightly clipped
plt.savefig(output_file, format='pdf')
plt.close()
def scale_values(values):
min_val = min(values)
max_val = max(values)
scaled_values = [(val - min_val) / (max_val - min_val) for val in values]
return scaled_values
if __name__ == '__main__':
# 定义参数
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', type=str, required=True, help='输入文件路径')
parser.add_argument('-W', '--output_w', type=str, required=True, help='输出W的文件路径')
parser.add_argument('-H', '--output_h', type=str, required=True, help='输出H的文件路径')
parser.add_argument('--rank_min', type=int, default=2, help='矩阵分解的最小秩')
parser.add_argument('--rank_max', type=int, default=30, help='矩阵分解的最大秩')
parser.add_argument('-n', '--num_iterations', type=int, default=3000, help='迭代次数')
args = parser.parse_args()
# 读取数据
print('Reading data...')
data = pd.read_csv(args.input, sep='\t', index_col=0)
# 寻找最优的rank值
best_rank, best_similarity = find_optimal_rank(data, args.rank_min, args.rank_max, args.num_iterations)
# 进行非负矩阵分解
print('=====================================================')
print('Performing NMF with Best rank: {}, Best similarity: {}'.format(best_rank, best_similarity))
W, H, _ = nmf(data, best_rank, args.num_iterations)
# 保存结果
print('Saving results...')
W.to_csv(args.output_w, sep='\t')
H.to_csv(args.output_h, sep='\t')