-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRNAcompete_LR_classifier.py
104 lines (86 loc) · 2.8 KB
/
RNAcompete_LR_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import warnings
warnings.filterwarnings('ignore')
import sys
import os
import pickle
import numpy as np
from sklearn.metrics import roc_auc_score
from skopt import BayesSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from skopt.space import Real
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
# --- Load and standardise the training, testing and ucRBP feature tables ---
# Every input is a tab-separated table whose first line is a header
# (skipped via skiprows=1). ndmin=2 keeps the result two-dimensional even
# when a file holds a single data row, so the column slicing and
# scaler.transform() below work for any number of rows.

# Training data: column 0 is the label, the remaining columns are features.
trainfile = "training_set.txt"
trainset = np.loadtxt(trainfile, delimiter='\t', skiprows=1, ndmin=2)
X = trainset[:, 1:]
Y = trainset[:, 0]

# Fit the scaler on the training features only, then reuse the same fitted
# scaler for every other feature matrix so all inputs share one scale.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

# Feature names come from the training header, minus the label column.
# NOTE(review): strip() also removes leading whitespace, so an empty first
# header cell would shift the names by one -- confirm column 0 is named.
with open(trainfile, 'r') as afile:
    featurenames = afile.readline().strip().split('\t')[1:]

# Testing data: same layout as the training table.
testfile = "testing_set.txt"
testset = np.loadtxt(testfile, delimiter='\t', skiprows=1, ndmin=2)
Xtest = testset[:, 1:]
Ytest = testset[:, 0]
Xtest = scaler.transform(Xtest)

# ucRBP experiment data: every column is treated as a feature (no label).
ucrbpfile = "ucrbp_experiment_features.txt"
Xucrbp = np.loadtxt(ucrbpfile, delimiter='\t', skiprows=1, ndmin=2)
Xucrbp = scaler.transform(Xucrbp)
# --- Hyperparameter search followed by the final model fit ---
# Base estimator handed to the Bayesian search.
lr = LogisticRegression(solver='liblinear')

# Search space: penalty and solver are pinned to a single choice each
# (L1 with liblinear); only the regularisation strength C is explored,
# sampled log-uniformly between 1e-6 and 100.
param_grid = dict(
    penalty=['l1'],
    solver=['liblinear'],
    C=Real(1e-6, 100, prior='log-uniform'),
)

# Bayesian optimisation over the space above: 30 sampled settings,
# single worker, fixed seed for the search itself.
opt = BayesSearchCV(
    estimator=lr,
    search_spaces=param_grid,
    n_iter=30,
    n_jobs=1,
    verbose=0,
    random_state=1234,
)
opt.fit(X, Y)

# Build a fresh classifier from the best parameters found and train it on
# the full training set; this is the model used by the rest of the script.
lr = LogisticRegression(**opt.best_params_)
lr.fit(X, Y)
# --- Evaluate on the test set and write the results ---
# Column 1 of predict_proba is taken as the score for each test row and is
# the value passed to roc_auc_score against the true labels.
test_predictions = lr.predict_proba(Xtest)[:, 1]
test_auroc = roc_auc_score(Ytest, test_predictions)

# One "<true label>\t<probability>" row per test example.
# NOTE(review): the header names a single column while every row carries
# two tab-separated fields -- confirm what downstream readers expect
# before changing the header.
with open('test_set_probability_estimates.txt', 'w') as outfile:
    outfile.write('probability.estimate\n')
    for label, probability in zip(Ytest, test_predictions):
        outfile.write(str(label) + '\t' + str(probability) + '\n')

# The AUROC is written on its own as a single line.
with open('test_set_AUROC.txt', 'w') as outfile:
    outfile.write(str(test_auroc) + "\n")
# --- Score the ucRBP experiments and write the results ---
# Column 1 of predict_proba is taken as the score for each experiment row.
ucrbp_predictions = lr.predict_proba(Xucrbp)[:, 1]

# Header line followed by one probability per line, in input row order.
with open('ucrbp_probability_estimates.txt', 'w') as outfile:
    outfile.write('probability.estimate\n')
    for probability in ucrbp_predictions:
        outfile.write(str(probability) + '\n')
# --- Persist the fitted model: readable coefficients plus a pickle ---
# One "<feature name>\t<coefficient>" row per entry of lr.coef_[0]. Names
# are looked up by index so that a mismatch between the number of feature
# names and coefficients raises instead of silently truncating the output.
with open("LR_coefficients.txt", 'w') as outfile:
    for i, coefficient in enumerate(lr.coef_[0]):
        outfile.write(featurenames[i] + '\t' + str(coefficient) + '\n')

# Serialise the fitted classifier; the context manager guarantees the file
# is flushed and closed once the dump completes (or fails).
with open("LR_model.sav", "wb") as filehandler:
    pickle.dump(lr, filehandler)