forked from OHDSI/ShinyDeploy
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocessing.R
230 lines (199 loc) · 12.5 KB
/
processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# this checked whether input is valid analysis location or plpResult
checkPlpInput <- function(result){
if(class(result)=='runPlp'){
return('plpResult')
} else if(ifelse(class(result)=='character', dir.exists(result),F)){
return('file')
} else if(sum(names(result)%in%c("prediction","performanceEvaluation","inputSetting","executionSummary","model","analysisRef","covariateSummary"))==7){
return('plpNoClass')
} else {
stop('Incorrect class for input result')
}
}
getSummary <- function(result,inputType,validation){
if(inputType == 'plpResult' || inputType == 'plpNoClass'){
sumTab <- getSummaryFromObject(result,validation)
} else if( inputType == 'file') {
sumTab <- summaryPlpAnalyses(result)
}
#remove empty rows
emptyInd <- is.na(sumTab[,'AUC'])
if(sum(emptyInd)>0){
sumTab <- sumTab[!emptyInd,]
}
#sumTab <- sumTab[,c('analysisId','devDatabase','valDatabase','cohortName','outcomeName','modelSettingName','riskWindowStart', 'riskWindowEnd', 'AUC','AUPRC', 'populationSize','outcomeCount','incidence',
# 'addExposureDaysToStart','addExposureDaysToEnd','plpResultLocation', 'plpResultLoad')]
#colnames(sumTab) <- c('Analysis','Dev', 'Val', 'T', 'O','Model', 'TAR start', 'TAR end', 'AUC','AUPRC', 'T Size','O Count','O Incidence (%)', 'addExposureDaysToStart','addExposureDaysToEnd', 'plpResultLocation', 'plpResultLoad')
sumTab <- sumTab[,c('devDatabase','valDatabase','cohortName','outcomeName','modelSettingName','riskWindowStart', 'riskWindowEnd', 'AUC','AUPRC', 'populationSize','outcomeCount','incidence',
'addExposureDaysToStart','addExposureDaysToEnd','plpResultLocation', 'plpResultLoad')]
colnames(sumTab) <- c('Dev', 'Val', 'T', 'O','Model', 'TAR start', 'TAR end', 'AUC','AUPRC', 'T Size','O Count','O Incidence (%)', 'addExposureDaysToStart','addExposureDaysToEnd', 'plpResultLocation', 'plpResultLoad')
return(sumTab)
}
getSummaryFromObject <- function(result,validation=NULL){
eval <- as.data.frame(result$performanceEvaluation$evaluationStatistics)
eval <- eval[eval$Eval %in% c('test',"validation"),]
allRes <- data.frame(analysisId = 1,
devDatabase = ifelse(is.null(result$inputSetting$dataExtrractionSettings$cdmDatabaseSchema),'Missing',result$inputSetting$dataExtrractionSettings$cdmDatabaseSchema),
valDatabase = ifelse(is.null(result$inputSetting$dataExtrractionSettings$cdmDatabaseSchema),'Missing',result$inputSetting$dataExtrractionSettings$cdmDatabaseSchema),
cohortName = 'T',
outcomeName = 'O',
modelSettingName = result$model$modelSettings$model,
riskWindowStart = ifelse(is.null(result$model$populationSettings$riskWindowStart), 'Missing',result$model$populationSettings$riskWindowStart),
riskWindowEnd = ifelse(is.null(result$model$populationSettings$riskWindowEnd), 'Missing',result$model$populationSettings$riskWindowEnd),
AUC = as.double(as.character(eval$Value[eval$Metric=='AUC.auc'])),
AUPRC = as.double(as.character(eval$Value[eval$Metric=='AUPRC'])),
populationSize = as.double(as.character(eval$Value[eval$Metric=='populationSize'])),
outcomeCount = as.double(as.character(eval$Value[eval$Metric=='outcomeCount'])),
incidence = as.double(as.character(eval$Value[eval$Metric=='outcomeCount']))/as.double(as.character(eval$Value[eval$Metric=='populationSize'])),
addExposureDaysToStart = ifelse(is.null(result$model$populationSettings$addExposureDaysToStart),'Missing',result$model$populationSettings$addExposureDaysToStart),
addExposureDaysToEnd = ifelse(is.null(result$model$populationSettings$addExposureDaysToEnd), 'Missing', result$model$populationSettings$addExposureDaysToEnd),
plpResultLocation = 'NULL',
plpResultLoad = 'NULL'
)
if(!is.null(validation)){
for(i in 1:length(validation$validation)){
eval <- as.data.frame(validation$validation[[i]]$performanceEvaluation$evaluationStatistics)
tempRes <-data.frame(analysisId = 1+i,
devDatabase = result$inputSetting$dataExtrractionSettings$cdmDatabaseSchema,
valDatabase = names(validation)[i],
cohortName = 'T',
outcomeName = 'O',
modelSettingName = result$model$modelSettings$model,
riskWindowStart = result$model$populationSettings$riskWindowStart,
riskWindowEnd = result$model$populationSettings$riskWindowEnd,
AUC = as.double(as.character(eval$Value[eval$Metric=='AUC.auc'])),
AUPRC = as.double(as.character(eval$Value[eval$Metric=='AUPRC'])),
populationSize = as.double(as.character(eval$Value[eval$Metric=='populationSize'])),
outcomeCount = as.double(as.character(eval$Value[eval$Metric=='outcomeCount'])),
incidence = as.double(as.character(eval$Value[eval$Metric=='outcomeCount']))/as.double(as.character(eval$Value[eval$Metric=='populationSize'])),
addExposureDaysToStart = result$model$populationSettings$addExposureDaysToStart,
addExposureDaysToEnd = result$model$populationSettings$addExposureDaysToEnd,
plpResultLocation = 'NULL',
plpResultLoad = 'NULL'
)
allRes <- rbind(tempRes, allRes)
}
}
return(allRes)
}
# old functions:
summaryPlpAnalyses <- function(analysesLocation){
# loads the analyses and validations to get summaries
#========================================
settings <- read.csv(file.path(analysesLocation,'settings.csv'))
settings <- settings[,!colnames(settings)%in%c('plpDataFolder','studyPopFile','plpResultFolder')]
settings$analysisId <- paste0('Analysis_', settings$analysisId)
analysisIds <- dir(file.path(analysesLocation), recursive = F, full.names = T)
analysisIds <- analysisIds[grep('Analysis_',analysisIds)]
if(is.null(settings$devDatabase)){
settings$devDatabase <- 'Missing'
}
settings$valDatabase <- settings$devDatabase
devPerformance <- do.call(rbind,lapply(file.path(analysisIds), getPerformance))
devPerformance <- merge(settings[,c('analysisId','modelSettingsId', 'cohortName', 'outcomeName',
'populationSettingId','modelSettingName','addExposureDaysToStart',
'riskWindowStart', 'addExposureDaysToEnd',
'riskWindowEnd','devDatabase','valDatabase')],
devPerformance, by='analysisId', all.x=T)
validationLocation <- file.path(analysesLocation,'Validation')
if(length(dir(validationLocation))>0){
valPerformances <- c()
valDatabases <- dir(validationLocation, recursive = F, full.names = T)
if(length(grep('plplog.txt', valDatabases))>0){
valDatabases <- valDatabases[-grep('plplog.txt', valDatabases)]
}
for( valDatabase in valDatabases){
valAnalyses <- dir(valDatabase, recursive = F, full.names = T)
valAnalyses <- valAnalyses[grep('Analysis_', valAnalyses)]
valPerformance <- do.call(rbind,lapply(file.path(valAnalyses), function(x) getValidationPerformance(x)))
valSettings <- settings[,c('analysisId','modelSettingsId', 'cohortName', 'outcomeName',
'populationSettingId','modelSettingName','addExposureDaysToStart',
'riskWindowStart', 'addExposureDaysToEnd',
'riskWindowEnd','devDatabase')]
#valSettings$devDatabase <- settings$devDatabase[1]
valPerformance <- merge(valSettings,
valPerformance, by='analysisId')
valPerformance <- valPerformance[,colnames(devPerformance)] # make sure same order
valPerformances <- rbind(valPerformances, valPerformance)
}
if(ncol(valPerformances)==ncol(devPerformance)){
allPerformance <- rbind(devPerformance,valPerformances)
} else{
stop('Issue with dev and val performance data.frames')
}
} else {
allPerformance <- devPerformance
}
allPerformance$AUC <- as.double(allPerformance$AUC)
allPerformance$AUPRC <- as.double(allPerformance$AUPRC)
allPerformance$outcomeCount <- as.double(allPerformance$outcomeCount)
allPerformance$populationSize <- as.double(allPerformance$populationSize)
allPerformance$incidence <- as.double(allPerformance$incidence)
return(allPerformance)
}
getPerformance <- function(analysisLocation){
location <- file.path(analysisLocation, 'plpResult.rds')
if(!file.exists(location)){
# check for PLP file instead
locationPlp <- file.path(analysisLocation, 'plpResult')
if(!dir.exists(locationPlp)){
analysisId <- strsplit(analysisLocation, '/')[[1]]
return(data.frame(analysisId=analysisId[length(analysisId)],
AUC=0.000, AUPRC=0, outcomeCount=0,
populationSize=0,incidence=0,plpResultLocation=location,
plpResultLoad='loadPlpResult'))
} else {
require(PatientLevelPrediction)
res <- loadPlpResult(file.path(analysisLocation,'plpResult'))
res <- as.data.frame(res$performanceEvaluation$evaluationStatistics)
location <- file.path(analysisLocation, 'plpResult')
plpResultLoad <- 'loadPlpResult'
}
} else{
# read rds here
res <- readRDS(file.path(analysisLocation,'plpResult.rds'))
res <- as.data.frame(res$performanceEvaluation$evaluationStatistics)
plpResultLoad <- 'readRDS'
}
#if empty do edit?
res <- tryCatch(reshape2::dcast(res[res$Eval=='test',], analysisId ~ Metric, value.var='Value'),
error = function(cont) return(NULL))
if(is.null(res)){
return(NULL) }
res <- res[,!colnames(res)%in%c("BrierScore","BrierScaled")]
res$incidence <- as.double(res$outcomeCount)/as.double(res$populationSize)*100
res[, !colnames(res)%in%c('analysisId','outcomeCount','populationSize')] <-
format(as.double(res[, !colnames(res)%in%c('analysisId','outcomeCount','populationSize')]), digits = 2, scientific = F)
if(sum(colnames(res)=='AUC.auc_ub95ci')>0){
res$AUC <- res$AUC.auc
#res$AUC <- paste0(res$AUC.auc, ' (', res$AUC.auc_lb95ci,'-', res$AUC.auc_ub95ci,')')
}
res$plpResultLocation <- location
res$plpResultLoad <- plpResultLoad
return(res[,c('analysisId', 'AUC', 'AUPRC', 'outcomeCount','populationSize','incidence','plpResultLocation', 'plpResultLoad')])
}
getValidationPerformance <- function(validationLocation){
val <- readRDS(file.path(validationLocation,'validationResult.rds'))
if("performanceEvaluation"%in%names(val)){
valPerformance <- reshape2::dcast(as.data.frame(val$performanceEvaluation$evaluationStatistics),
analysisId ~ Metric, value.var='Value')
} else {
valPerformance <- reshape2::dcast(as.data.frame(val[[1]]$performanceEvaluation$evaluationStatistics),
analysisId ~ Metric, value.var='Value')
}
valPerformance$incidence <- as.double(valPerformance$outcomeCount)/as.double(valPerformance$populationSize)*100
valPerformance[, !colnames(valPerformance)%in%c('analysisId','outcomeCount','populationSize')] <-
format(as.double(valPerformance[, !colnames(valPerformance)%in%c('analysisId','outcomeCount','populationSize')]), digits = 2, scientific = F)
if(sum(colnames(valPerformance)=='AUC.auc_ub95ci')>0){
valPerformance$AUC <- valPerformance$AUC.auc
#valPerformance$AUC <- paste0(valPerformance$AUC.auc, ' (', valPerformance$AUC.auc_lb95ci,'-', valPerformance$AUC.auc_ub95ci,')')
}
valPerformance$analysisId <- strsplit(validationLocation, '/')[[1]][[length(strsplit(validationLocation, '/')[[1]])]]
valPerformance$valDatabase <- strsplit(validationLocation, '/')[[1]][[length(strsplit(validationLocation, '/')[[1]])-1]]
valPerformance <- valPerformance[,c('analysisId','valDatabase', 'AUC', 'AUPRC', 'outcomeCount','populationSize','incidence')]
valPerformance$plpResultLocation <- file.path(validationLocation,'validationResult.rds')
valPerformance$plpResultLoad <- 'readRDS'
#valPerformance$rocplot <- file.path(validationLocation,'plots','sparseROC.pdf')
#valPerformance$calplot <- file.path(validationLocation,'plots','sparseCalibrationConventional.pdf')
return(valPerformance)
}