Add files via upload

morrislab · Oct 18, 2022 · 041d04c · 041d04c
1 parent 22409ef
commit 041d04c
Show file tree

Hide file tree

Showing 7 changed files with 2,386 additions and 0 deletions.
diff --git a/RNAcompete_LR_classifier.py b/RNAcompete_LR_classifier.py
@@ -0,0 +1,104 @@
+import warnings
+warnings.filterwarnings('ignore')
+import sys
+import os
+import pickle
+import numpy as np
+from sklearn.metrics import roc_auc_score
+from skopt import BayesSearchCV
+from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import train_test_split
+from skopt.space import Real
+from sklearn.linear_model import LogisticRegressionCV
+from sklearn import metrics
+from sklearn.model_selection import cross_val_score 
+
+# Read in training data
+trainfile = "training_set.txt"
+trainset = np.loadtxt(trainfile, delimiter='\t', skiprows=1)
+
+X = trainset[:, 1:len(trainset[0])]
+Y = trainset[:, 0]
+
+# Scale data
+scaler = StandardScaler()
+scaler.fit(X)
+X = scaler.transform(X)
+
+# Read in feature names
+afile = open(trainfile, 'r')
+featurenames = afile.readline().strip().split('\t')[1:]
+afile.close()
+
+# Read in and scale testing data
+testfile = "testing_set.txt"
+testset = np.loadtxt(testfile, delimiter='\t', skiprows=1)
+Xtest = testset[:, 1:len(testset[0])]
+Ytest = testset[:, 0]
+Xtest = scaler.transform(Xtest)
+
+# Read in and scale ucRBP experiment data
+ucrbpfile = "ucrbp_experiment_features.txt"
+Xucrbp = np.loadtxt(ucrbpfile, delimiter='\t', skiprows=1)
+Xucrbp = scaler.transform(Xucrbp)
+
+# PREPARE TO RUN LOGISTIC REGRESSION
+lr = LogisticRegression(solver='liblinear')
+
+# DEFINE PARAMETER GRID
+param_grid = {
+    'penalty': ['l1'],
+    'solver': ['liblinear'],
+    'C': Real(low=1e-6, high=100, prior='log-uniform'),
+}
+
+# SET UP OPTIMIZER
+opt = BayesSearchCV(
+    lr,
+    param_grid,
+    n_iter=30,
+    random_state=1234,
+    verbose=0,
+    n_jobs = 1
+)
+
+opt.fit(X, Y)
+
+# FIT PARAMETERS
+lr = LogisticRegression(**opt.best_params_)
+lr.fit(X, Y)    
+
+# Get probability estimates and AUROC for test set
+test_predictions = lr.predict_proba(Xtest)[:,1]
+test_auroc = roc_auc_score(Ytest, test_predictions)
+
+# Write probability estimates and AUROC for test set
+outfile = open('test_set_probability_estimates.txt', 'w')
+outfile.write('probability.estimate\n')
+for i in range(len(test_predictions)):
+    outfile.write(str(Ytest[i])+'\t'+str(test_predictions[i])+'\n')
+outfile.close()
+
+outfile = open('test_set_AUROC.txt', 'w')
+outfile.write(str(test_auroc)+"\n")
+outfile.close()
+
+# Get probability estimates for ucrbp experiments
+ucrbp_predictions = lr.predict_proba(Xucrbp)[:,1]
+
+# Write probability estimates for ucrbp experiments
+outfile = open('ucrbp_probability_estimates.txt', 'w')
+outfile.write('probability.estimate\n')
+for i in range(len(ucrbp_predictions)):
+    outfile.write(str(ucrbp_predictions[i])+'\n')
+outfile.close()
+
+# Write model coefficients to file
+outfile = open("LR_coefficients.txt", 'w')
+for i in range(len(lr.coef_[0])):
+    outfile.write(featurenames[i]+'\t'+str(lr.coef_[0][i])+'\n')
+outfile.close()
+
+filehandler = open("LR_model.sav","wb")
+pickle.dump(lr,filehandler)
diff --git a/test_IDs.txt b/test_IDs.txt
@@ -0,0 +1,41 @@
+ID
+RNCMPT01016
+RNCMPT01027
+RNCMPT01028
+RNCMPT01058
+RNCMPT01099
+RNCMPT01100
+RNCMPT01462
+RNCMPT01469
+RNCMPT01554
+RNCMPT01560
+RNCMPT01668
+RNCMPT01787
+RNCMPT00300
+RNCMPT00312
+RNCMPT00335
+RNCMPT00336
+RNCMPT00337
+RNCMPT00338
+RNCMPT00409
+RNCMPT00410
+RNCMPT00459
+RNCMPT00498
+RNCMPT00512
+RNCMPT00520
+RNCMPT00553
+RNCMPT00570
+RNCMPT00588
+RNCMPT00589
+RNCMPT00590
+RNCMPT00593
+RNCMPT00709
+RNCMPT00777
+RNCMPT00778
+RNCMPT00797
+RNCMPT00811
+RNCMPT00829
+RNCMPT00841
+RNCMPT00867
+RNCMPT00933
+RNCMPT00954