Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
kaitlin309 authored Oct 18, 2022
1 parent 22409ef commit 041d04c
Show file tree
Hide file tree
Showing 7 changed files with 2,386 additions and 0 deletions.
104 changes: 104 additions & 0 deletions RNAcompete_LR_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import warnings
warnings.filterwarnings('ignore')
import sys
import os
import pickle
import numpy as np
from sklearn.metrics import roc_auc_score
from skopt import BayesSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from skopt.space import Real
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Load the training table: tab-delimited, one header row, column 0 holds the
# label and every remaining column is a feature.
# ndmin=2 keeps the result two-dimensional even when a file contains a single
# data row; without it np.loadtxt returns a 1-D array and the column slicing
# and scaler calls below fail.
trainfile = "training_set.txt"
trainset = np.loadtxt(trainfile, delimiter='\t', skiprows=1, ndmin=2)

X = trainset[:, 1:]
Y = trainset[:, 0]

# Fit the scaler on the training features only; the same fitted scaler is
# reused for the test set and the ucRBP experiments further down.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Feature names come from the header row of the training file, skipping the
# first (label) column.
with open(trainfile, 'r') as afile:
    featurenames = afile.readline().strip().split('\t')[1:]

# Load the test table (same layout as the training table) and scale it with
# the training-set statistics.
testfile = "testing_set.txt"
testset = np.loadtxt(testfile, delimiter='\t', skiprows=1, ndmin=2)
Xtest = testset[:, 1:]
Ytest = testset[:, 0]
Xtest = scaler.transform(Xtest)

# Load the ucRBP experiment features. Every column of this file is passed to
# the scaler, i.e. it is read as features only, with no label column.
ucrbpfile = "ucrbp_experiment_features.txt"
Xucrbp = np.loadtxt(ucrbpfile, delimiter='\t', skiprows=1, ndmin=2)
Xucrbp = scaler.transform(Xucrbp)

# Search space for the Bayesian optimiser. Penalty and solver are
# single-element lists, so only the inverse regularisation strength C
# (log-uniform between 1e-6 and 100) actually varies between candidates.
param_grid = dict(
    penalty=['l1'],
    solver=['liblinear'],
    C=Real(1e-6, 100, prior='log-uniform'),
)

# Bayesian hyperparameter search over 30 sampled settings, run in a single
# process with a fixed seed. Scoring and cross-validation folds are not
# passed, so the library defaults apply.
opt = BayesSearchCV(
    estimator=LogisticRegression(solver='liblinear'),
    search_spaces=param_grid,
    n_iter=30,
    n_jobs=1,
    verbose=0,
    random_state=1234,
)
opt.fit(X, Y)

# Build a fresh model from the best parameters found and train it on the
# complete training set; this is the model used for all outputs below.
lr = LogisticRegression(**opt.best_params_)
lr.fit(X, Y)

# Score the held-out test set. Column 1 of predict_proba is the probability
# of the second class listed in lr.classes_ (presumably the positive label;
# verify against the label encoding of the input files).
test_predictions = lr.predict_proba(Xtest)[:, 1]
test_auroc = roc_auc_score(Ytest, test_predictions)

# Write one "<true label>\t<probability>" row per test example.
# The context manager closes the file even if a write fails.
# NOTE(review): the header names a single column while each row has two
# fields. This may be deliberate (R's read.table treats the first field as
# row names in that case) -- confirm with downstream consumers before
# changing it.
with open('test_set_probability_estimates.txt', 'w') as outfile:
    outfile.write('probability.estimate\n')
    for label, probability in zip(Ytest, test_predictions):
        outfile.write(str(label) + '\t' + str(probability) + '\n')

# Write the test-set AUROC as a single line.
with open('test_set_AUROC.txt', 'w') as outfile:
    outfile.write(str(test_auroc) + "\n")

# Score the ucRBP experiments with the trained model, taking column 1 of
# predict_proba (the second class in lr.classes_).
ucrbp_predictions = lr.predict_proba(Xucrbp)[:, 1]

# Write a header line followed by one probability per line, in the same
# order as the rows of the ucRBP feature file. The context manager closes
# the file even if a write fails.
with open('ucrbp_probability_estimates.txt', 'w') as outfile:
    outfile.write('probability.estimate\n')
    for probability in ucrbp_predictions:
        outfile.write(str(probability) + '\n')

# Write one "<feature name>\t<coefficient>" row per model coefficient.
# Names are looked up by index so that a header with fewer names than
# coefficients still raises IndexError instead of silently dropping rows.
with open("LR_coefficients.txt", 'w') as outfile:
    for i, coefficient in enumerate(lr.coef_[0]):
        outfile.write(featurenames[i] + '\t' + str(coefficient) + '\n')

# Serialise the trained model. The original never closed this handle, so
# the pickle was only flushed to disk at interpreter exit; the context
# manager closes it as soon as the dump completes.
with open("LR_model.sav", "wb") as filehandler:
    pickle.dump(lr, filehandler)
41 changes: 41 additions & 0 deletions test_IDs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
ID
RNCMPT01016
RNCMPT01027
RNCMPT01028
RNCMPT01058
RNCMPT01099
RNCMPT01100
RNCMPT01462
RNCMPT01469
RNCMPT01554
RNCMPT01560
RNCMPT01668
RNCMPT01787
RNCMPT00300
RNCMPT00312
RNCMPT00335
RNCMPT00336
RNCMPT00337
RNCMPT00338
RNCMPT00409
RNCMPT00410
RNCMPT00459
RNCMPT00498
RNCMPT00512
RNCMPT00520
RNCMPT00553
RNCMPT00570
RNCMPT00588
RNCMPT00589
RNCMPT00590
RNCMPT00593
RNCMPT00709
RNCMPT00777
RNCMPT00778
RNCMPT00797
RNCMPT00811
RNCMPT00829
RNCMPT00841
RNCMPT00867
RNCMPT00933
RNCMPT00954
Loading

0 comments on commit 041d04c

Please sign in to comment.