This repository was archived by the owner on Jul 10, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel_pssm.py
101 lines (82 loc) · 2.78 KB
/
model_pssm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame
import pickle
from sklearn import svm
from predictor_fasta import binary_rawdata
from predictor_fasta import data_window
from predictor_fasta import data_svm
# Working directory captured once at import time.
# NOTE(review): `path` is not read by any code visible in this file — confirm
# whether another module depends on it before removing.
path = os.getcwd()
###### Parsing single train sequence ######
def pssm_parser(seqfile, pssmfile, windowsize):
    """Build sliding-window feature vectors from one PSSM CSV file.

    Every row of the CSV is treated as the feature vector of one position
    (presumably one residue of the sequence -- confirm against the files
    that produce the CSV).  For each row a window of ``windowsize``
    consecutive rows is flattened into a single list.  The first and last
    rows are repeated at the edges so that every row gets a full window.

    Args:
        seqfile: Path of the matching sequence file.  It is accepted only to
            keep the call signature stable: the sequence content never
            influenced the returned features, so the file is not read.
        pssmfile: Path of a header-less CSV file holding the PSSM.
        windowsize: Number of consecutive rows per window.  Odd sizes are
            centred on the row; for even sizes the extra row is taken from
            the right-hand side.

    Returns:
        A list with one entry per CSV row; each entry is a flat list of
        ``windowsize * <number of CSV columns>`` values.

    Raises:
        ValueError: If ``windowsize`` is smaller than 1.
    """
    if windowsize < 1:
        raise ValueError("windowsize must be >= 1, got %r" % (windowsize,))

    # One Python list per CSV row (header=None keeps the first line as data).
    rows = pd.read_csv(pssmfile, header=None).values.tolist()

    # Total padding must be windowsize - 1 so the last window stays in range.
    # Padding 2 * ((windowsize - 1) // 2) is one row short for even sizes,
    # which made the final window index past the end of the list.
    left_pad = (windowsize - 1) // 2
    right_pad = windowsize - 1 - left_pad
    padded = [rows[0]] * left_pad + rows + [rows[-1]] * right_pad

    # Slide the window: exactly one flattened window per original row.
    return [
        [value for row in padded[start:start + windowsize] for value in row]
        for start in range(len(rows))
    ]
def mult_seq_pssm_parser(seqDirectory, pssmDirectory, windowsize):
    """Collect windowed PSSM features for every file in a sequence directory.

    Each entry ``<name>`` of ``seqDirectory`` is paired with
    ``<name>.pssm.csv`` in ``pssmDirectory``; the pair is handed to
    ``pssm_parser`` and the resulting windows are concatenated.

    Args:
        seqDirectory: Directory whose entries name the sequences to process.
        pssmDirectory: Directory holding the ``<name>.pssm.csv`` files.
        windowsize: Window size forwarded unchanged to ``pssm_parser``.

    Returns:
        A single flat list of window feature vectors, grouped by sequence
        file in sorted filename order.
    """
    dataSeq = []
    # os.listdir() yields entries in arbitrary, platform-dependent order.
    # Sort so the row order is reproducible and callers that pair these rows
    # with labels by position have a defined order to align against.
    for seqfilename in sorted(os.listdir(seqDirectory)):
        seqfilepath = os.path.join(seqDirectory, seqfilename)
        pssmfilepath = os.path.join(pssmDirectory, seqfilename + '.pssm.csv')
        dataSeq.extend(pssm_parser(seqfilepath, pssmfilepath, windowsize))
    return dataSeq
if __name__ == "__main__":
    # Training script: fit a linear SVM on windowed PSSM features and pickle
    # the fitted model to models/linsvm_pssm.pkl.

    # The same window size is passed to the label pipeline and to the PSSM
    # feature extraction below.
    windowsize = 15

    print("Parsing data...")
    raw_records = binary_rawdata("data/trainset.dat")

    print("Adding window...")
    windowed_records = data_window(windowsize, raw_records)

    print("SVM prediction preparing...")
    svm_table = data_svm(windowed_records)

    # Features come from the PSSM files, labels from the `seqTopo` field.
    # NOTE(review): fit() pairs them by position — confirm both sources list
    # the sequences in the same order.
    features = mult_seq_pssm_parser(
        'pssm/Sequences',
        'pssm/pssmMatrix',
        windowsize,
    )
    labels = pd.Series.tolist(svm_table.seqTopo)

    print("Model building...")
    clf = svm.LinearSVC(max_iter=500, dual=False)
    clf.fit(features, labels)

    print("Saving models...")
    model_dir = 'models'
    model_file = os.path.join(model_dir, 'linsvm_pssm.pkl')
    # Create the output directory on first run.
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    with open(model_file, 'wb') as handle:
        pickle.dump(clf, handle)

    print("Model Built!")