diff --git a/Final_Analysis.Rmd b/Final_Analysis.Rmd new file mode 100644 index 0000000..7532da1 --- /dev/null +++ b/Final_Analysis.Rmd @@ -0,0 +1,790 @@ +--- +title: "BioInfo_KCrossVal_LogReg_Updated" +author: "Charles Marks" +date: "February 2019" +output: html_document +--- + +## This Document + +The following code was utilized in the production of the manuscript entitled "Machine learning-based predictive modeling of surgical intervention in glaucoma using systemic data from electronic health records" by Sally L. Baxter, MD, MSc, Charles Marks, MPH, Tsung-Ting Kuo, PhD, Lucila Ohno-Machado, MD, PhD, Robert N. Weinreb, MD + +## Initializing rMarkdown + +Here we set options and upload the libraries necessary for our full analysis. + +```{r setup, include=FALSE} +library(tidyverse) +library(tableone) +library(PerformanceAnalytics) +library(psych) +library(kableExtra) +library(ROCR) +library(randomForest) +library(neuralnet) +options(digits = 3, scipen = 999) +knitr::opts_chunk$set(warning=FALSE, message=FALSE, error = FALSE) +``` + + +## Loading the Dataset + +This dataset was produced from UCSD EHR records. As a human subjects protection, this dataset is not being made available with the code to analyze. Concerns and questions should be directed to the corresponding author of the work. + +```{r dataset} +df <- read.csv("final_dataset.csv", header=TRUE) +``` + +## Exclusion Criteria + +So, now we need to exclude people based on certain factors. The three exclusion factors are : + +- Not having 6 months of records prior to first surgery date +- Having less than six months of records data in the system at all +- Not having any vital records in the system + +```{r pressure, echo=FALSE} + + +#this will be our final dataset +clean_data <- data.frame(matrix(nrow=0,ncol=ncol(df))) + +#set the column names to match our full data set +colnames(clean_data) <- colnames(df) + +#this will contain all patients excluded because they didn't have 6 months of records prior to their first surgery +surgery_exclude_data <- data.frame(matrix(nrow=0,ncol=ncol(df))) +colnames(surgery_exclude_data) <- colnames(df) + +#this will contain non-surgery patients with less than six months of data +newpatient_exclude_data <- data.frame(matrix(nrow=0,ncol=ncol(df))) +colnames(newpatient_exclude_data) <- colnames(df) + +#this will contain all patients with no vital records +no_vital_data_exclude_data <- data.frame(matrix(nrow=0,ncol=ncol(df))) +colnames(no_vital_data_exclude_data) <- colnames(df) + +#loop through all patients in the uploaded dataset to check +#each of the exclusion criteria +for(patient_num in 1:nrow(df)) +{ + #if patient has had surgery and has vital records + if(!is.na(df$first_surgery_date[patient_num]) && + !is.na(df$first_vitals_contact_date[patient_num])) + { + #check the time difference between first vital contact and first surgery + time_test <- difftime(df$first_surgery_date[patient_num], + df$first_vitals_contact_date[patient_num], units = "days") + #if length is greater than 6 months (ie 180 days), include + if(time_test > 180) + { + # place in the included data set + clean_data <- rbind(clean_data, df[patient_num,]) + + } # else exclude by placing in the surgery exclude table + else + { + surgery_exclude_data <- rbind(surgery_exclude_data, df[patient_num,]) + } + } #for non-surgery patients with vital records + else if(is.na(df$first_surgery_date[patient_num]) && + !is.na(df$first_vitals_contact_date[patient_num])) + { + #check if first record for patient in the system occurred prior to 6 months + #before the end of September 2018 + time_test2 <- difftime(as.Date("10/01/2018","%m/%d/%Y"), + df$first_vitals_contact_date[patient_num], units = "days") + #if 6 months or longer, include + if(time_test2 > 180) + { + clean_data <- rbind(clean_data, df[patient_num,]) + } #if less than 6 months, exclude + else { + newpatient_exclude_data <- rbind(newpatient_exclude_data, df[patient_num,]) + } + } #the remaining patients don't have vital records and therefore are excluded + else + { + no_vital_data_exclude_data <- rbind(no_vital_data_exclude_data, df[patient_num,]) + } +} +``` + +## table one + +Here is the descriptive statistics, stratified by history of glaucoma surgery. Rows from this can be taken to present a final descriptive table, if so desired. Descriptive table might not be overly meaningful tho... + +```{r tableone} +tone_vars = colnames(clean_data) +#the CreateTableOne function didn't like having these columns included +tone_vars[12] <- NA +tone_vars[28] <- NA +tone <- tableone::CreateTableOne(vars = tone_vars, strata = c("any_glaucoma_surgey") ,data = clean_data) + +# necessary step to make the kableExtra package work, this just makes the output easier +# to read for internal use +tone_df <- print(tone, printToggle = FALSE, noSpaces = TRUE) + +# prints table one +kable(tone_df) %>% + kable_styling(bootstrap_options = "striped", full_width = F) +``` + +## Bivariate Associations + +We wanted to look and see if there are bivariate associations between each of our predictor variables and our outcome, so we looped through the table, conducting univariate logistic regressions and reporting he results + +```{r Bivariate, warning=FALSE} + +## create a table for bivariate results +bi.final <- data.frame(matrix(nrow = 0, ncol = 3)) + +## loop through the columns of the data +for(i in 3:ncol(clean_data)) +{ + # gets the variable name for the column + iv <- colnames(clean_data)[i] + # we need to check for an error because the column data may not be regressable (ie only one value for the variable for all participants) + possibleError <- tryCatch( + #run the model + bi.model <- glm (any_glaucoma_surgey ~ get(iv), data = clean_data, family = + binomial(link="logit")), error = function(e) e + ) + + ## if there was no error we will run the following code to add the results to our + ## bi.final table + if(!inherits(possibleError, "error")){ + + ## another error check, this step also produced an error at times + possibleError2 <- tryCatch( + ## convert the coefficient into an OR and compute the confidence interval + + bi.results <- exp(cbind(OR = coef(bi.model), confint(bi.model))), + error = function(e) e) + ## if there was no error above we will add the results to the table of results + if(!inherits(possibleError2, "error")){ + + ## add to the final bivariate results + rownames(bi.results)[2] <- colnames(clean_data)[i] + bi.final <- rbind(bi.final, bi.results) + } + } + +} +# this table represents all variables found to be significant and their OR +bi.signif <- bi.final[which(bi.final$`2.5 %` > 1 | bi.final$`97.5 %`<1),] + +##output both tables to view the results +kable(bi.final) %>% + kable_styling(bootstrap_options = "striped", full_width = F) + +kable(bi.signif) %>% + kable_styling(bootstrap_options = "striped", full_width = F) + +``` + + +## Logisitc Regression 5-Fold Cross Validation + +So, we are going to run the 5-fold crossval 25 times. We shall utilize a bi-directional stepwise logistic regression function to run the analyses on 80% of the data set and then predict on the other fifth of the data set. Here is the code for running this. + +```{r lofreg 25 times} + +## this function runs a single fold of the 5-fold crossval +## it takes an 80% training set and a 20% test set and returns +## the results of the prediction and the actual values attempting to predict +runfold <- function(train, test) +{ + + # stepwise regression requires setting the null model + model.null = glm(any_glaucoma_surgey ~ 1, + data=train, + family = binomial(link="logit") + ) + + # stepwise regression requires setting the full model with all 48 variables + model.full= glm(any_glaucoma_surgey ~ Age + Gender +systolic_max + systolic_min + systolic_mean + diastolic_max + diastolic_min + diastolic_mean +pulse_min + pulse_max + pulse_mean +ever_hospitalized + days_hospitalized + CHF + PVD + Stroke + Dementia + Pulmonary + LiverMild + DM + DMcx + Renal + Cancer + Mets + med_class.Analgesics_._non.opioids + med_class.Analgesics_._opioids + med_class.Anti.rheumatic + med_class.Antianxiety_agents + med_class.Antiasthmatic + med_class.Anticoagulants + med_class.Anticonvulsant + med_class.Antidepressants + med_class.Antidiabetic + med_class.Antiemetics + med_class.Antihyperlipidemic + med_class.Antihypertensive + med_class.Beta_blockers + med_class.Calcium_blockers + med_class.Corticosteroids + med_class.Cough.Cold + med_class.Decongestants + med_class.Dermatological + med_class.Diuretics + med_class.Laxatives + med_class.Macrolide_antibiotics + med_class.Misc._antiinfectives + med_class.Ophthalmic+med_class.Ulcer_drugs, + data=train, + family = binomial(link="logit") + ) + + # this runs the stepwise regression and saves the regression equation + step.results <- step(model.null, + scope = list(upper=model.full), + direction="both", + test="Chisq", + data=train, trace = 0) + + # now run the final chosen model + final.model <- eval(step.results$call) + + # this applies the stepwise model and attempts to predict the results on the training set + lr.pr <- predict(final.model, test, type="response") + + # return the predicted values and the actual values + return(list(lr.pr,test$any_glaucoma_surgey)) +} + +## this function will randomize the data into five even groups and run +## the cross validations, returning a list of predicted values and a list of actual values + +fivefold <- function(data){ + + #create a vector with the appropriate number of 1,2,3,4,5 + values <- rep(1:5, length.out=nrow(data)) + #randomly sort this vector + random <- sample(values) + #add this vector to the data, essentially assigning each row a random group + data$cv_group <- random + + + ## next we want to assign the five test and training pairings + train.one <- data[which(data$cv_group!= 1),] + test.one <- data[which(data$cv_group== 1),] + train.two <- data[which(data$cv_group!= 2),] + test.two <- data[which(data$cv_group== 2),] + train.three <- data[which(data$cv_group!= 3),] + test.three <- data[which(data$cv_group== 3),] + train.four <- data[which(data$cv_group!= 4),] + test.four <- data[which(data$cv_group== 4),] + train.five <- data[which(data$cv_group!= 5),] + test.five <- data[which(data$cv_group== 5),] + + # now we run our 5 folds + res1 <- runfold(train.one, test.one) + res2 <- runfold(train.two, test.two) + res3 <- runfold(train.three, test.three) + res4 <- runfold(train.four, test.four) + res5 <- runfold(train.five, test.five) + + #save our lists of predictions and actuals + predictions <- list(res1[1][[1]],res2[1][[1]],res3[1][[1]],res4[1][[1]],res5[1][[1]]) + actuals <- list(res1[2][[1]],res2[2][[1]],res3[2][[1]],res4[2][[1]],res5[2][[1]]) + + # return the results + return(list(predictions, actuals)) + +} + +# these lists will be population with our predictions and actuals +predictions <- list() +actuals <- list() + +# this loop will run the 25 iterations of 5-crossfold +for(i in 1:25){ + + # run one iteration of the five crossfold + kfold_result <- fivefold(clean_data) + + #add the predictions and actuals for each fold + predictions <- append(predictions, kfold_result[[1]]) + actuals <- append(actuals, kfold_result[[2]]) + + # this is just for progress checking + message(i) +} + +### Let us make the average ROC curve +pred <- ROCR::prediction(predictions, actuals) + +### Lets plot the average +avg_auc_curve_log_reg <- ROCR::performance(pred,"tpr","fpr") +plot(avg_auc_curve_log_reg, avg="vertical",main='Average ROC Curve for Logistic Regression',col=2,lwd=2) +abline(a=0,b=1,lwd=2,lty=2,col='gray') + +### And all the curves +plot(avg_auc_curve_log_reg,main='All the ROC Curves for Logistic Regression',col=2,lwd=1) +abline(a=0,b=1,lwd=2,lty=2,col='gray') + +### And lets compute the average ROC curve +auc_list <- ROCR::performance(pred,"auc")@y.values + +aucs <- c() +for(num in auc_list) { aucs <- c(aucs, num)} + +auc_mean_lr <- mean(aucs) +auc_min_lr <- min(aucs) +auc_max_lr <- max(aucs) +auc_sd_lr <-sd(aucs) + +## we will save the predictions and actuals for later +## so that we can graph the average roc curves of all three +## models at the same time + +lr_predictions <- predictions +lr_actuals <- actuals + + + +``` + +### Running Stepwise Log Reg + +As well, we want to just report the logisitc regression results of the entire data set + +```{r full logreg} + +## set the null model + +model.null = glm(any_glaucoma_surgey ~ 1, + data=clean_data, + family = binomial(link="logit") +) + + +# set the full model + +model.full= glm(any_glaucoma_surgey ~ Age + Gender +systolic_max + systolic_min + systolic_mean + diastolic_max + diastolic_min + diastolic_mean +pulse_min + pulse_max + pulse_mean +ever_hospitalized + days_hospitalized + CHF + PVD + Stroke + Dementia + Pulmonary + LiverMild + DM + DMcx + Renal + Cancer + Mets + med_class.Analgesics_._non.opioids + med_class.Analgesics_._opioids + med_class.Anti.rheumatic + med_class.Antianxiety_agents + med_class.Antiasthmatic + med_class.Anticoagulants + med_class.Anticonvulsant + med_class.Antidepressants + med_class.Antidiabetic + med_class.Antiemetics + med_class.Antihyperlipidemic + med_class.Antihypertensive + med_class.Beta_blockers + med_class.Calcium_blockers + med_class.Corticosteroids + med_class.Cough.Cold + med_class.Decongestants + med_class.Dermatological + med_class.Diuretics + med_class.Laxatives + med_class.Macrolide_antibiotics + med_class.Misc._antiinfectives + med_class.Ophthalmic+med_class.Ulcer_drugs, + data=clean_data, + family = binomial(link="logit") +) + +# run the stepwise procedure + +step.results <- step(model.null, + scope = list(upper=model.full), + direction="both", + test="Chisq", + data=clean_data, trace = 0) + +# run the final model + +final.model <- eval(step.results$call) + +# create a table with the aORs and confidence intervals and p-values +results <- exp(cbind(OR = coef(final.model), confint(final.model))) +results <- cbind(results,summary(final.model)$coefficients[,4]) + +# print out the results +kable(results) %>% + kable_styling(bootstrap_options = "striped", full_width = F) + +``` + +## Random Forests CrossVal + +So, we are going to run the 5-fold crossval 25 times. We shall utilize random forests algorithm to run the analyses on 80% of the data set and then predict on the other fifth of the data set. Here is the code for running this. + +```{r rf 25 times} + +# this function runs a single fold, training on 80% of the data and testing on the other 20% +runfold_rf <- function(train, test) +{ + ## this code runs the random forests on our 48 variables + model.rf = randomForest(any_glaucoma_surgey ~ Age + Gender +systolic_max + systolic_min + systolic_mean + diastolic_max + diastolic_min + diastolic_mean +pulse_min + pulse_max + pulse_mean +ever_hospitalized + days_hospitalized + CHF + PVD + Stroke + Dementia + Pulmonary + LiverMild + DM + DMcx + Renal + Cancer + Mets + med_class.Analgesics_._non.opioids + med_class.Analgesics_._opioids + med_class.Anti.rheumatic + med_class.Antianxiety_agents + med_class.Antiasthmatic + med_class.Anticoagulants + med_class.Anticonvulsant + med_class.Antidepressants + med_class.Antidiabetic + med_class.Antiemetics + med_class.Antihyperlipidemic + med_class.Antihypertensive + med_class.Beta_blockers + med_class.Calcium_blockers + med_class.Corticosteroids + med_class.Cough.Cold + med_class.Decongestants + med_class.Dermatological + med_class.Diuretics + med_class.Laxatives + med_class.Macrolide_antibiotics + med_class.Misc._antiinfectives + med_class.Ophthalmic + med_class.Ulcer_drugs, + data=train, importance = TRUE, mtry = 6 + ) + + # this line uses the random forests model to predict the results of the test data + # the predicted values are saved + pr <- predict(model.rf, test, type="prob")[,2] + + # here we return the list of predicted and actuals + return(list(pr,test$any_glaucoma_surgey)) + +} + +## this function will randomize the data into five even groups and run +## the cross validations, returning an average AUC and the AUC curves +## of each of the cross validations +fivefold_rf <- function(data){ + + #create a vector with the appropriate number of 1,2,3,4,5 + values <- rep(1:5, length.out=nrow(data)) + #randomly sort this vector + random <- sample(values) + #add this vector to the data, essentially assigning each row a random group + data$cv_group <- random + + + ## next we want to assign the five test and training pairings + train.one <- data[which(data$cv_group!= 1),] + test.one <- data[which(data$cv_group== 1),] + train.two <- data[which(data$cv_group!= 2),] + test.two <- data[which(data$cv_group== 2),] + train.three <- data[which(data$cv_group!= 3),] + test.three <- data[which(data$cv_group== 3),] + train.four <- data[which(data$cv_group!= 4),] + test.four <- data[which(data$cv_group== 4),] + train.five <- data[which(data$cv_group!= 5),] + test.five <- data[which(data$cv_group== 5),] + + # run each of the five crossfolds + res1 <- runfold_rf(train.one, test.one) + res2 <- runfold_rf(train.two, test.two) + res3 <- runfold_rf(train.three, test.three) + res4 <- runfold_rf(train.four, test.four) + res5 <- runfold_rf(train.five, test.five) + + # make a list of predictions and actuals + predictions <- list(res1[1][[1]],res2[1][[1]],res3[1][[1]],res4[1][[1]],res5[1][[1]]) + actuals <- list(res1[2][[1]],res2[2][[1]],res3[2][[1]],res4[2][[1]],res5[2][[1]]) + + # return the lsits of predictions and actuals + return(list(predictions, actuals)) + +} + +# make the initial list to compile all the predictions and actuals +predictions <- list() +actuals <- list() + +# run 25 5 crossfolds +for(i in 1:25){ + + # run one 5 crossfold + kfold_result <- fivefold_rf(clean_data) + + # add the predictions and actuals from each run to the list of all of them + predictions <- append(predictions, kfold_result[[1]]) + actuals <- append(actuals, kfold_result[[2]]) + message(i) + + +} + + +### Let us make the average ROC curve +pred <- ROCR::prediction(predictions, actuals) + +### Lets plot the average +avg_auc_curve_rf <- ROCR::performance(pred,"tpr","fpr") +plot(avg_auc_curve_rf, avg="vertical",main='Average ROC Curve for Random Forests',col=2,lwd=2) +abline(a=0,b=1,lwd=2,lty=2,col='gray') + +### And all the curvers +plot(avg_auc_curve_rf,main='All the ROC Curves for Random Forest',col=2,lwd=1) +abline(a=0,b=1,lwd=2,lty=2,col='gray') + +### And lets compute the average ROC curve +auc_list <- ROCR::performance(pred,"auc")@y.values + +aucs <- c() +for(num in auc_list) { aucs <- c(aucs, num)} + +auc_mean_rf <- mean(aucs) +auc_min_rf <- min(aucs) +auc_max_rf <- max(aucs) +auc_sd_rf <-sd(aucs) + +## finally we will save the predictions and actuals for later analysis + +rf_predictions <- predictions +rf_actuals <- actuals + + + +``` + +### Random Forests Importance Scores + +We also want to compute the importance scores for the entire data set + +```{r RF Importance } + +# run random forests on the full dataset + +model.rf = randomForest(any_glaucoma_surgey ~ Age + Gender +systolic_max + systolic_min + systolic_mean + diastolic_max + diastolic_min + diastolic_mean +pulse_min + pulse_max + pulse_mean +ever_hospitalized + days_hospitalized + CHF + PVD + Stroke + Dementia + Pulmonary + LiverMild + DM + DMcx + Renal + Cancer + Mets + med_class.Analgesics_._non.opioids + med_class.Analgesics_._opioids + med_class.Anti.rheumatic + med_class.Antianxiety_agents + med_class.Antiasthmatic + med_class.Anticoagulants + med_class.Anticonvulsant + med_class.Antidepressants + med_class.Antidiabetic + med_class.Antiemetics + med_class.Antihyperlipidemic + med_class.Antihypertensive + med_class.Beta_blockers + med_class.Calcium_blockers + med_class.Corticosteroids + med_class.Cough.Cold + med_class.Decongestants + med_class.Dermatological + med_class.Diuretics + med_class.Laxatives + med_class.Macrolide_antibiotics + med_class.Misc._antiinfectives + med_class.Ophthalmic + med_class.Ulcer_drugs, + data=clean_data, importance = TRUE, mtry = 6 +) + +# get the results +results <- importance(model.rf) + +# output the results +kable(results) %>% + kable_styling(bootstrap_options = "striped", full_width = F) + +``` + +## ANN CrossVal + +### Make Numeric Data Set + +To Run ANN you have to use a numeric data set and everything need to be scaled as well + +```{r numericize} + +#first scale the data +numeric_data <- clean_data +# this loop goes through all of the variables and makes the non-numeric data numeric +for(i in 1:ncol(numeric_data)) +{ + if(!is.numeric(numeric_data[,i])) + { + numeric_data[,i] <- as.numeric(numeric_data[,i]) + } + + +} +# these three lines scale all of the data frame +max <- apply(numeric_data,2,max) +min <- apply(numeric_data,2,min) +numeric_data <- as.data.frame(scale(numeric_data, center = min, scale = max - min)) +``` + +### Run the CrossVal + +Like with RF and logistic regression, we will run 25 iterations of 5-fold crossvalidation. We are utilizing an artificial neural network with one hidden layer (24 hidden nodes). + +```{r ANN 25 times} + +# this runs one of the five folds +runfold_ann <- function(train, test) +{ + # fit the ANN model + model.ann <- neuralnet(any_glaucoma_surgey ~ Age + Gender +systolic_max + systolic_min + systolic_mean + diastolic_max + diastolic_min + diastolic_mean +pulse_min + pulse_max + pulse_mean +ever_hospitalized + days_hospitalized + CHF + PVD + Stroke + Dementia + Pulmonary + LiverMild + DM + DMcx + Renal + Cancer + Mets + med_class.Analgesics_._non.opioids + med_class.Analgesics_._opioids + med_class.Anti.rheumatic + med_class.Antianxiety_agents + med_class.Antiasthmatic + med_class.Anticoagulants + med_class.Anticonvulsant + med_class.Antidepressants + med_class.Antidiabetic + med_class.Antiemetics + med_class.Antihyperlipidemic + med_class.Antihypertensive + med_class.Beta_blockers + med_class.Calcium_blockers + med_class.Corticosteroids + med_class.Cough.Cold + med_class.Decongestants + med_class.Dermatological + med_class.Diuretics + med_class.Laxatives + med_class.Macrolide_antibiotics + med_class.Misc._antiinfectives + med_class.Ophthalmic + med_class.Ulcer_drugs, + data=train, hidden = c(24), threshold = 0.01 + ) + + # so, this next line, we need a dataframe that only contains the variables included in the model, these numbers represent the columns of the data frame of all the variables contained in the model...it's not pretty but it works + test.cv <- test[,c(3,4,13,14,15,17,18,19,30,31,32,36,37,38,39,40,43,44,45,47,48,50,52,53,55,56,61,63,65,66,67,68,69,72,75,76,87,88,94,95,96,97,101,108,110,115,126,136)] + + # predict + prob <- compute(model.ann,test.cv) + + # save the results of the prediction + prob.results <- prob$net.result + + # return the predicted values and the actuals + return(list(prob.results,test$any_glaucoma_surgey)) + + +} + +## this function will randomize the data into five even groups and run +## the cross validations, returning an average AUC and the AUC curves +## of each of the cross validations +fivefold_ann <- function(data){ + + #create a vector with the appropriate number of 1,2,3,4,5 + values <- rep(1:5, length.out=nrow(data)) + #randomly sort this vector + random <- sample(values) + #add this vector to the data, essentially assigning each row a random group + data$cv_group <- random + + + ## next we want to assign the five test and training pairings + train.one <- data[which(data$cv_group!= 1),] + test.one <- data[which(data$cv_group== 1),] + train.two <- data[which(data$cv_group!= 2),] + test.two <- data[which(data$cv_group== 2),] + train.three <- data[which(data$cv_group!= 3),] + test.three <- data[which(data$cv_group== 3),] + train.four <- data[which(data$cv_group!= 4),] + test.four <- data[which(data$cv_group== 4),] + train.five <- data[which(data$cv_group!= 5),] + test.five <- data[which(data$cv_group== 5),] + + # run the five folds + res1 <- runfold_ann(train.one, test.one) + res2 <- runfold_ann(train.two, test.two) + res3 <- runfold_ann(train.three, test.three) + res4 <- runfold_ann(train.four, test.four) + res5 <- runfold_ann(train.five, test.five) + + + # save the lists of predictions and actuals + predictions <- list(res1[1][[1]],res2[1][[1]],res3[1][[1]],res4[1][[1]],res5[1][[1]]) + actuals <- list(res1[2][[1]],res2[2][[1]],res3[2][[1]],res4[2][[1]],res5[2][[1]]) + + return(list(predictions, actuals)) + +} + +# the lists to hold all the predictions and actuals +predictions <- list() +actuals <- list() + +#run it 25 times +for(i in 1:25){ + + # run one 5-fold + kfold_result <- fivefold_ann(numeric_data) + + # save the predicteds and actuals + predictions <- append(predictions, kfold_result[[1]]) + actuals <- append(actuals, kfold_result[[2]]) + message(i) +} + + +### Let us make the average ROC curve +pred <- ROCR::prediction(predictions, actuals) + +### Lets plot the average +avg_auc_curve_ann <- ROCR::performance(pred,"tpr","fpr") +plot(avg_auc_curve_ann, avg="vertical",main='Average ROC Curve for ANN',col=2,lwd=2) +abline(a=0,b=1,lwd=2,lty=2,col='gray') + +### And all the curvers +plot(avg_auc_curve_ann,main='All the ROC Curves for Artifical Neural Network',col=2,lwd=1) +abline(a=0,b=1,lwd=2,lty=2,col='gray') + +### And lets compute the average ROC curve +auc_list <- ROCR::performance(pred,"auc")@y.values + +aucs <- c() +for(num in auc_list) { aucs <- c(aucs, num)} + +auc_mean_ann <- mean(aucs) +auc_min_ann <- min(aucs) +auc_max_ann <- max(aucs) +auc_sd_ann <-sd(aucs) + +## finally we will save the predicteds and actuals for later + +ann_predictions <- predictions +ann_actuals <- actuals + + + +``` + +### Print Average AUC Curves Together + +We wanted to print the average AUC curvers all together, here is the code for doing that + +```{r average AUCs} +plot(avg_auc_curve_ann, avg="vertical",main='Average ROC Curves', lwd = 2, pch = 0) +plot(avg_auc_curve_rf, avg="vertical",add = TRUE,lwd=2, pch = 8, lty = "dotted") +plot(avg_auc_curve_log_reg, avg="vertical",add = TRUE,lwd=2, pch = 6, lty = "twodash") +abline(a=0,b=1,lwd=2,lty=2,col='gray') +legend(0,1, legend=c("ANN", "RF", "LogReg"), lty=c('solid','dotted','twodash'), lwd =2, cex=0.8) +``` +### Here Are the AUCs and Related Stats + +First we wanted to compute statistics about the AUCs of each model. The averages, max, in, and standard deviations + +```{r output tables} + +## create the values for the table, compute the means, sd, mins, and maxs +models <- c("Logistic Regression", "Random Forest", "Artificial Neural Network") +means <- c(auc_mean_lr, auc_mean_rf, auc_mean_ann) +sds <- c(auc_sd_lr,auc_sd_rf, auc_sd_ann) +mins <- c(auc_min_lr,auc_min_rf,auc_min_ann) +maxs <- c(auc_max_lr, auc_max_rf, auc_max_ann) + +# place values into a single dataframe +auc_table <- data.frame(matrix(nrow = 3, ncol = 0)) +auc_table$model <- models +auc_table$auc_mean <- round(means, digits = 3) +auc_table$auc_sd <- round(sds, digits = 3) +auc_table$auc_min <- round(mins, digits = 3) +auc_table$auc_max <- round(maxs, digits = 3) + +# print it out +kable(auc_table) %>% + kable_styling(bootstrap_options = "striped", full_width = F) + +``` + +### CutPoint + +We also want to compute the specificity, sensitivity and accuracy of the model. We used the oc_youden_kernel in the cutpointr function in order to identify this point. + +```{r cutpoint} + +## for finding cutpoint +library(cutpointr) + + +### Logisitic Regression +lr_sensitivity_sum <- 0 +lr_specificity_sum <- 0 +lr_cutpoint_sum <- 0 +lr_accuracy_sum <- 0 + +## we went through each of the 25 runs and computed the metrics for each +for(i in 1:length(lr_actuals)) { + cp <- cutpointr(x = lr_predictions[[i]], class = lr_actuals[[i]], method = oc_youden_kernel) + + sum_cp <-summary(cp)$cutpointr[[1]] + + lr_sensitivity_sum <- lr_sensitivity_sum + sum_cp$sensitivity + lr_specificity_sum <- lr_specificity_sum + sum_cp$specificity + lr_cutpoint_sum <- lr_cutpoint_sum + sum_cp$optimal_cutpoint + lr_accuracy_sum <- lr_accuracy_sum + sum_cp$acc +} + +## then we reported the averages of each value for each run +lr_mean_sensitivity <- lr_sensitivity_sum/length(lr_actuals) +lr_mean_specificity <- lr_specificity_sum/length(lr_actuals) +lr_mean_cutpoint <- lr_cutpoint_sum/length(lr_actuals) +lr_mean_accuracy <- lr_accuracy_sum/length(lr_actuals) + +## Random Forests + +rf_sensitivity_sum <- 0 +rf_specificity_sum <- 0 +rf_cutpoint_sum <- 0 +rf_accuracy_sum <- 0 + +# compute metrics for each run +for(i in 1:length(rf_actuals)) { + cp <- cutpointr(x = rf_predictions[[i]], class = rf_actuals[[i]], method = oc_youden_kernel) + + sum_cp <-summary(cp)$cutpointr[[1]] + + rf_sensitivity_sum <- rf_sensitivity_sum + sum_cp$sensitivity + rf_specificity_sum <- rf_specificity_sum + sum_cp$specificity + rf_cutpoint_sum <- rf_cutpoint_sum + sum_cp$optimal_cutpoint + rf_accuracy_sum <- rf_accuracy_sum + sum_cp$acc +} + +#average the results across all 25 folds +rf_mean_sensitivity <- rf_sensitivity_sum/length(rf_actuals) +rf_mean_specificity <- rf_specificity_sum/length(rf_actuals) +rf_mean_cutpoint <- rf_cutpoint_sum/length(rf_actuals) +rf_mean_accuracy <- rf_accuracy_sum/length(rf_actuals) + +### ANN + +ann_sensitivity_sum <- 0 +ann_specificity_sum <- 0 +ann_cutpoint_sum <- 0 +ann_accuracy_sum <- 0 + +for(i in 1:length(ann_predictions)) { + ann_predictions[[i]] <- ann_predictions[[i]][,1] +} + +for(i in 1:length(ann_actuals)) { + cp <- cutpointr(x = ann_predictions[[i]], class = ann_actuals[[i]], method = oc_youden_kernel) + + sum_cp <-summary(cp)$cutpointr[[1]] + + ann_sensitivity_sum <- ann_sensitivity_sum + sum_cp$sensitivity + ann_specificity_sum <- ann_specificity_sum + sum_cp$specificity + ann_cutpoint_sum <- ann_cutpoint_sum + sum_cp$optimal_cutpoint + ann_accuracy_sum <- ann_accuracy_sum + sum_cp$acc +} + +ann_mean_sensitivity <- ann_sensitivity_sum/length(ann_actuals) +ann_mean_specificity <- ann_specificity_sum/length(ann_actuals) +ann_mean_cutpoint <- ann_cutpoint_sum/length(ann_actuals) +ann_mean_accuracy <- ann_accuracy_sum/length(ann_actuals) + + +## create the final table +models <- c("Logistic Regression", "Random Forest", "Artificial Neural Network") +sensitivitys <- c(lr_mean_sensitivity,rf_mean_sensitivity, ann_mean_sensitivity ) +specificitys <- c(lr_mean_specificity,rf_mean_specificity, ann_mean_specificity ) +accuracys <- c(lr_mean_accuracy,rf_mean_accuracy, ann_mean_accuracy) +cutpoints <- c(lr_mean_cutpoint,rf_mean_cutpoint, ann_mean_cutpoint ) + +table <- data.frame(matrix(nrow = 3, ncol = 0)) +table$model <- models +table$sensitivity <- round(sensitivitys, digits = 3) +table$specificity <- round(specificitys, digits = 3) +table$accuracy <- round(accuracys, digits = 3) +table$cutpoint <- round(cutpoints, digits = 3) + + +## print it all out +kable(table) %>% + kable_styling(bootstrap_options = "striped", full_width = F) + +``` +