diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index a21d18af..1ffccd1b 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -39,7 +39,7 @@ env: run_pkgdown: 'false' has_RUnit: 'false' cache-version: 'cache-v1' - run_docker: 'true' + run_docker: 'false' jobs: build-check: @@ -52,9 +52,9 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, r: '4.1', bioc: '3.14', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - - { os: macOS-latest, r: '4.1', bioc: '3.14'} - - { os: windows-latest, r: '4.1', bioc: '3.14'} + - { os: ubuntu-latest, r: '4.2', bioc: '3.16', cont: "bioconductor/bioconductor_docker:RELEASE_3_16", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } + - { os: macOS-latest, r: '4.2', bioc: '3.16'} + - { os: windows-latest, r: '4.2', bioc: '3.16'} ## Check https://github.com/r-lib/actions/tree/master/examples ## for examples using the http-user-agent env: @@ -79,12 +79,12 @@ jobs: ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml ## If they update their steps, we will also need to update ours. - name: Checkout Repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 ## R is already included in the Bioconductor docker images - name: Setup R from r-lib if: runner.os != 'Linux' - uses: r-lib/actions/setup-r@master + uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} http-user-agent: ${{ matrix.config.http-user-agent }} @@ -92,7 +92,7 @@ jobs: ## pandoc is already included in the Bioconductor docker images - name: Setup pandoc from r-lib if: runner.os != 'Linux' - uses: r-lib/actions/setup-pandoc@master + uses: r-lib/actions/setup-pandoc@v2 - name: Query dependencies run: | @@ -102,19 +102,19 @@ jobs: - name: Restore R package cache if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'" - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ${{ env.R_LIBS_USER }} - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.1-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.1- + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2- - name: Cache R packages on Linux if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: /home/runner/work/_temp/Library - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.1-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.1- + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2- - name: Install Linux system dependencies if: runner.os == 'Linux' @@ -176,7 +176,7 @@ jobs: gha_repos <- if( .Platform$OS.type == "unix" && Sys.info()["sysname"] != "Darwin" ) c( - "AnVIL" = "https://bioconductordocker.blob.core.windows.net/packages/3.14/bioc", + "AnVIL" = "https://bioconductordocker.blob.core.windows.net/packages/3.16/bioc", BiocManager::repositories() ) else BiocManager::repositories() @@ -218,7 +218,7 @@ jobs: - name: Install pkgdown if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: | - remotes::install_github("r-lib/pkgdown") + remotes::install_cran("pkgdown") shell: Rscript {0} - name: Session info @@ -236,7 +236,7 @@ jobs: options(crayon.enabled = TRUE) rcmdcheck::rcmdcheck( args = c("--no-manual", "--no-vignettes", "--timings"), - build_args = c("--no-manual", "--keep-empty-dirs", "--resave-data"), + build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"), error_on = "warning", check_dir = "check" ) @@ -275,31 +275,47 @@ jobs: if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: R CMD INSTALL . - - name: Build and deploy pkgdown site + - name: Build pkgdown site if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' - run: | - git config --local user.name "$GITHUB_ACTOR" - git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" - Rscript -e "pkgdown::deploy_to_branch(new_process = FALSE)" - shell: bash {0} + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE) ## at least one locally before this will work. This creates the gh-pages ## branch (erasing anything you haven't version controlled!) and ## makes the git history recognizable by pkgdown. + - name: Install deploy dependencies + if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' + run: | + apt-get update && apt-get -y install rsync + + - name: Deploy pkgdown site to GitHub pages 🚀 + if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' + uses: JamesIves/github-pages-deploy-action@releases/v4 + with: + clean: false + branch: gh-pages + folder: docs + - name: Upload check results if: failure() uses: actions/upload-artifact@master with: - name: ${{ runner.os }}-biocversion-devel-r-4.1-results + name: ${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-results path: check - - name: Build & push Docker image v5 - if: runner.os == 'Linux' - uses: mr-smithers-excellent/docker-build-push@v5 + ## Note that DOCKER_PASSWORD is really a token for your dockerhub + ## account, not your actual dockerhub account password. + ## This comes from + ## https://seandavi.github.io/BuildABiocWorkshop/articles/HOWTO_BUILD_WORKSHOP.html#6-add-secrets-to-github-repo + ## Check https://github.com/docker/build-push-action/tree/releases/v1 + ## for more details. + - uses: docker/build-push-action@v1 + if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && runner.os == 'Linux' " with: - image: shraddhapai/netdx_devenv - tags: latest - registry: docker.io username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} + repository: realpailab/netdx + tag_with_ref: true + tag_with_sha: true + tags: latest diff --git a/.github/workflows/push-docker.yml b/.github/workflows/push-docker.yml new file mode 100644 index 00000000..7a9aed3e --- /dev/null +++ b/.github/workflows/push-docker.yml @@ -0,0 +1,24 @@ +name: Docker Build + +on: + push: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + name: Check out code + + - uses: mr-smithers-excellent/docker-build-push@v5 + name: Build and push Docker image + with: + image: realpailab/netdx + registry: docker.io + addLatest: 'true' + addTimestamp: 'true' + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index a535e8d7..ff7e0028 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ .Ruserdata doc Meta +inst/doc diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..cdfe39c7 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,30 @@ +# YAML 1.2 +--- +authors: + - + family-names: Pai + given-names: Shraddha + - + family-names: Shah + given-names: Ahmad + - + family-names: Hui + given-names: Shirley + - + family-names: Isserlin + given-names: Ruth + - + family-names: Kaka + given-names: Hussam + - + family-names: Bader + given-names: Gary +cff-version: "1.1.0" +date-released: 2019 +doi: "10.15252/msb.20188497" +license: MIT +message: "If you use this software, please cite it using these metadata." +repository-code: "https://github.com/RealPaiLab/netDx" +title: "netDx: Network-based patient classifier" +version: "1.5.5" +... \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index b03de76d..8dfb51f6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: netDx Title: Network-based patient classifier -Version: 1.5.3 +Version: 1.7.1 Authors@R: c(person("Shraddha", "Pai", email = "shraddha.pai@utoronto.ca", role = c("aut", "cre"), @@ -9,6 +9,8 @@ Authors@R: c(person("Shraddha", "Pai", person("Ahmad","Shah", role="aut"), person("Luca","Giudice",role="aut"), person("Shirley","Hui",role="aut"), + person("Anne","Nøhr",role="ctb"), + person("Indy","Ng",role="ctb"), person("Ruth","Isserlin",role="aut"), person("Hussam","Kaka", role="aut"), person("Gary","Bader",role="aut")) @@ -17,15 +19,11 @@ Depends: R (>= 3.6) Suggests: curatedTCGAData, - TCGAutils, rmarkdown, testthat, knitr, BiocStyle, - RCy3, - clusterExperiment, - netSmooth, - scater + RCy3 Imports: ROCR,pracma,ggplot2,glmnet,igraph,reshape2, parallel,stats,utils,MultiAssayExperiment,graphics,grDevices, methods,BiocFileCache,GenomicRanges, diff --git a/NAMESPACE b/NAMESPACE index 94091be0..e3a6ed0f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,9 +12,12 @@ export(compileFeatureScores) export(compileFeatures) export(confusionMatrix) export(convertProfileToNetworks) +export(convertToMAE) export(countIntType) export(countIntType_batch) export(countPatientsInNet) +export(createInputForFeatureNetworkView) +export(createNetFuncFromSimList) export(createPSN_MultiData) export(dataList2List) export(enrichLabelNets) @@ -34,7 +37,6 @@ export(getPerformance) export(getRegionOL) export(getResults) export(getSimilarity) -export(makeInputForEnrichmentMap) export(makePSN_NamedMatrix) export(makePSN_RangeSets) export(makeQueries) @@ -42,7 +44,6 @@ export(makeSymmetric) export(mapNamedRangesToSets) export(normDiff) export(perfCalc) -export(plotEmap) export(plotIntegratedPatientNetwork) export(plotPerf) export(plotPerf_multi) @@ -59,14 +60,14 @@ export(setupFeatureDB) export(sim.eucscale) export(sim.pearscale) export(simpleCap) -export(smoothMutations_LabelProp) export(sparsify2) export(sparsify3) export(splitTestTrain) export(splitTestTrain_resampling) +export(subsampleValidationData) export(tSNEPlotter) -export(thresholdSmoothedMutations) export(updateNets) +export(viewSelectedFeaturesAsNetworks) export(writeNetsSIF) export(writeQueryBatchFile) export(writeQueryFile) diff --git a/NEWS b/NEWS index 9f0ed121..5220ade7 100644 --- a/NEWS +++ b/NEWS @@ -1,10 +1,32 @@ -netDx 1.5.3 +3.14 RELEASE +================== + +netDx 1.7.1 +================== +* Added vignette showing how to view top features as connected networks in Cytoscape (Enrichment Map). +* Improved Java support. Includes: + * Prompt to users to install Java if not installed, and requirement check for Java 16 or lower. + * Feature selection with Java 12+ resulted in an "illegal access" error due to a change in how Java handles reflection. netDx sidesteps + the error for Java 9-16 but cannot do so for Java 17. Java 12-16 should now work with feature selection. +* plotPerf() now provides the option to not plot the ROC and PR curves with a drawPlot parameter. + +netDx 1.5.10 +================== +* Functions to create network visualizations of top-scoring functions renamed to better reflect function. + * plotEMap() -> viewSelectedFeaturesAsNetworks + * makeInputForEnrichmentMap() -> createInputForFeatureNetworkView() + +netDx 1.5.9 +================== +* Ability to smooth somatic mutations over an interaction network via (smoothMutations_LabelProp) has been temporarily removed. This function made software install challenging on machines without C++ compilers as it relies on clusterExperiment. The latter can only be compiled from source. This functionality will be added back when doing so does not complicate the install. It is unclear whether users want to routinely use this functionality. + +netDx 1.5.8 ================== * Moved RCy3, scater, clusterExperiment and netSmooth to "Suggests" to reduce dependency burden * Sped up vignettes by limiting all to binary classification and limiting number of layers * Removed TL;DR from vignettes as usefulness in question but maintainance high. -Developers notes: +Developer notes: ------------------- * Added Dockerfile and Github Actions for automated testing * GHA auto-generates a Docker image with netDx which gets pushed to shraddhapai/netdx_devenv diff --git a/R/buildPredictor.R b/R/buildPredictor.R index a84fcebb..e1e7886f 100644 --- a/R/buildPredictor.R +++ b/R/buildPredictor.R @@ -29,6 +29,9 @@ #' So keys(groupList[["rna"]]) would have pathway names, generating one PSN #' per pathways, and values(groupList[["rna"]]) would be genes that would be #' grouped for the corresponding pathwayList. +#' @param sims (list) rules to create similarity networks from input data. Keys are names of +#' data layers and should be identical to names(groupList). Values is either a character +#' for built-in similarity functions; call allowedSims() to see full list; or a custom function. #' @param makeNetFunc (function) user-defined function for creating the set #' of input PSN provided to netDx. See createPSN_MultiData()::customFunc. #' @param outDir (char) directory where results will be stored. If this @@ -169,9 +172,10 @@ #' # takes 10 minutes to run #' #out <- buildPredictor(dataList=brca,groupList=groupList, #' # makeNetFunc=makeNets, ### custom network creation function -#' # outDir=paste(tempdir(),"pred_output",sep=getFileSep()), ## absolute path +#' # outDir=paste(normalizePath(tempdir()),"pred_output",sep=getFileSep()), ## absolute path #' # numCores=16L,featScoreMax=2L, featSelCutoff=1L,numSplits=2L) -buildPredictor <- function(dataList,groupList,outDir=tempdir(),makeNetFunc, +buildPredictor <- function(dataList,groupList,outDir=normalizePath(tempdir()), + makeNetFunc=NULL,sims=NULL, featScoreMax=10L,trainProp=0.8,numSplits=10L,numCores,JavaMemory=4L, featSelCutoff=9L,keepAllData=FALSE,startAt=1L, preFilter=FALSE, impute=FALSE,preFilterGroups=NULL, imputeGroups=NULL,logging="default", @@ -196,7 +200,7 @@ if (logging == "all") { verbose_predict <- FALSE } -# Check input +# Check input - error handling if (missing(dataList)) stop("dataList must be supplied.\n") if (missing(groupList)) stop("groupList must be supplied.\n") if (length(groupList)<1) stop("groupList must be of length 1+\n") @@ -213,6 +217,9 @@ if (!is(groupList,"list") || not_list || names_nomatch ) { stop(paste(msg,sep="")) } +# checks either/or provided, sets missing var to NULL +x <- checkMakeNetFuncSims(makeNetFunc=makeNetFunc, sims=sims,groupList=groupList) + if (!is(dataList,"MultiAssayExperiment")) stop("dataList must be a MultiAssayExperiment") @@ -220,6 +227,8 @@ if (trainProp <= 0 | trainProp >= 1) stop("trainProp must be greater than 0 and less than 1") if (startAt > numSplits) stop("startAt should be between 1 and numSplits") +# end check input error handling + megaDir <- outDir if (file.exists(megaDir)) { stop(paste("outDir seems to already exist!", @@ -247,8 +256,13 @@ for (k in seq_len(length(exprs))) { tmp <- exprs[[k]] df <- sampleMap(dataList)[which(sampleMap(dataList)$assay==names(exprs)[k]),] colnames(tmp) <- df$primary[match(df$colname,colnames(tmp))] - tmp <- as.matrix(assays(tmp)[[1]]) # convert to matrix - datList2[[names(exprs)[k]]]<- tmp + if ("matrix" %in% class(tmp)) { + datList2[[names(exprs)[k]]] <- tmp + } else { + tmp <- as.matrix(assays(tmp)[[1]]) # convert to matrix + datList2[[names(exprs)[k]]]<- tmp + } + } if ("clinical" %in% names(groupList)) { tmp <- colData(dataList) @@ -274,7 +288,6 @@ if (verbose_default){ } } - outList <- list() # create master list of possible networks @@ -289,8 +302,14 @@ colnames(tmp) <- c("NetType","NetName") outList[["inputNets"]] <- tmp if (verbose_default) { - message("\n\nCustom function to generate input nets:") - print(makeNetFunc) + if (!is.null(makeNetFunc)){ + message("\n\nCustom function to generate input nets:") + print(makeNetFunc) + + } else { + message("Similarity metrics provided:") + print(sims) + } message(sprintf("-------------------------------\n")) } @@ -386,8 +405,8 @@ for (rngNum in startAt:numSplits) { if (verbose_default) message("** Creating features") createPSN_MultiData(dataList=dats_train,groupList=groupList, - pheno=pheno_id, - netDir=netDir,customFunc=makeNetFunc,numCores=numCores, + pheno=pheno_id, + netDir=netDir,makeNetFunc=makeNetFunc,sims=sims, numCores=numCores, verbose=verbose_makeFeatures) if (verbose_default) message("** Compiling features") dbDir <- compileFeatures(netDir,outDir, numCores=numCores, @@ -512,7 +531,8 @@ for (rngNum in startAt:numSplits) { pheno_id <- setupFeatureDB(pheno,netDir) createPSN_MultiData(dataList=dats_tmp,groupList=groupList, pheno=pheno_id, - netDir=netDir,customFunc=makeNetFunc,numCores=numCores, + netDir=netDir,makeNetFunc=makeNetFunc,sims=sims, + numCores=numCores, filterSet=pTally,verbose=verbose_default) dbDir <- compileFeatures(netDir,outDir=pDir,numCores=numCores, verbose=verbose_compileNets,debugMode=debugMode) diff --git a/R/buildPredictor_sparseGenetic.R b/R/buildPredictor_sparseGenetic.R index ea7b9839..0a2ff878 100644 --- a/R/buildPredictor_sparseGenetic.R +++ b/R/buildPredictor_sparseGenetic.R @@ -117,7 +117,7 @@ #' name=genes[,4]) #' #' # create GRangesList of pathways -#' pathFile <- fetchPathwayDefinitions("February",2018,verbose=TRUE) +#' pathFile <- fetchPathwayDefinitions("February",2021,verbose=TRUE) #' pathwayList <- readPathways(pathFile) #' path_GRList <- mapNamedRangesToSets(gene_GR,pathwayList) #' @@ -131,7 +131,7 @@ #' #head(out$cumulativeFeatScores) #' buildPredictor_sparseGenetic <- function(phenoDF,cnv_GR,predClass, - group_GRList,outDir=tempdir(), + group_GRList,outDir=normalizePath(tempdir()), numSplits=3L, featScoreMax=10L, filter_WtSum=100L, enrichLabels=TRUE,enrichPthresh=0.07,numPermsEnrich=2500L,minEnr=-1, diff --git a/R/compileFeatures.R b/R/compileFeatures.R index f5bb54b0..fb9ceb90 100644 --- a/R/compileFeatures.R +++ b/R/compileFeatures.R @@ -51,21 +51,21 @@ #' writeProfiles=TRUE,...) #' unlist(netList) #' } -#' tmpDir <- tempdir(); netDir <- paste(tmpDir,"nets", +#' tmpDir <- normalizePath(tempdir()); netDir <- paste(tmpDir,"nets", #' sep=getFileSep()) #' if (file.exists(netDir)) unlink(netDir,recursive=TRUE) #' dir.create(netDir,recursive=TRUE) #' #' pheno_id <- setupFeatureDB(pheno,netDir) #' netList <- createPSN_MultiData(dataList=dataList, groupList=groupList, -#' pheno=pheno_id,netDir=netDir,customFunc=makeNets,verbose=TRUE) +#' pheno=pheno_id,netDir=netDir,makeNetFunc=makeNets,verbose=TRUE) #' #' outDir <- paste(tmpDir,'dbdir',sep=getFileSep()); #' dir.create(outDir) #' dbDir <- compileFeatures(netDir,outDir) #' @import doParallel #' @export -compileFeatures <- function(netDir, outDir = tempdir(), +compileFeatures <- function(netDir, outDir = normalizePath(tempdir()), simMetric = "pearson", netSfx = "txt$", verbose = TRUE, numCores = 1L, P2N_threshType = "off", P2N_maxMissing = 100, @@ -125,14 +125,17 @@ compileFeatures <- function(netDir, outDir = tempdir(), curProf <- "" `%myinfix%` <- ifelse(debugMode, `%do%`, `%dopar%`) + foreach(curProf = dir(path = profDir, pattern = "profile$")) %myinfix% { args2 <- c("-in", paste(profDir, curProf, sep = getFileSep())) args2 <- c(args2, "-out", - paste(netOutDir, sub(".profile", ".txt", curProf), + paste(netOutDir, sub(".profile", ".txt", curProf), sep = getFileSep())) + args2 <- c(args2, "-syn", paste(netDir, "1.synonyms", sep = getFileSep()), "-keepAllTies", "-limitTies") + if (debugMode) { message("Making Java call") tmp <- paste(c(args, args2), collapse = " ") @@ -214,18 +217,19 @@ compileFeatures <- function(netDir, outDir = tempdir(), unlink(olddir) # Check: need to replace commas used as decimal separators, into periods - tmp <- dir(path = sprintf("%s/INTERACTIONS", netDir), pattern = "txt$")[1] - tmp <- sprintf("%s/INTERACTIONS/%s", netDir, tmp) + tmp <- dir(path = paste(netDir,"INTERACTIONS", sep=getFileSep()), pattern = "txt$")[1] + tmp <- sprintf(paste(netDir,"INTERACTIONS",tmp,sep=getFileSep())) if (sum(grepl(pattern = ",", readLines(tmp, n = 1)) > 0)) { # detect comma - replacePattern(path = sprintf("%s/INTERACTIONS", netDir)) + replacePattern(path = paste(netDir,"INTERACTIONS", sep=getFileSep())) } # Build GeneMANIA cache if (verbose) message("\t* Build GeneMANIA cache") - args <- c("-Xmx10G", "-cp", GM_jar, + args <- c("--illegal-access=permit", # needed for Java 9-16. Deprecated in Java 17 + "-Xmx10G", "-cp", GM_jar, "org.genemania.engine.apps.CacheBuilder") args <- c(args, "-cachedir", "cache", "-indexDir", ".", "-networkDir", diff --git a/R/convertToMAE.R b/R/convertToMAE.R new file mode 100644 index 00000000..0e81e3bc --- /dev/null +++ b/R/convertToMAE.R @@ -0,0 +1,82 @@ +#' Wrapper that converts an input list into a MultiAssayExperiment object +#' +#' @details This function takes in a list of key-value pairs (keys: data types, +#' values: matrices/dataframes) and calls the necessary functions from the +#' MultiAssayExperiment package to incorporate the values from the input list +#' into a MultiAssayExperiment object, transforming the values according to the +#' keys. If duplicate sample names are found in the assay data, only the first +#' instance is kept. +#' @param dataList (list) input key-value pairs (keys: data types, values: +#' data in the form of matrices/dataframes); must have a key-value pair that +#' corresponds to patient IDs/metadata labelled pheno. +#' @return MAE (MultiAssayExperiment) data from input list incorporated into a +#' MultiAssayExperiment object, compatible with further analysis using the +#' netDx algorithm. +#' @examples +#' data(xpr, pheno) +#' +#' # Generate random proteomic data +#' prot <- matrix(rnorm(100*20), ncol=20) +#' colnames(prot) <- sample(pheno$ID, 20) +#' rownames(prot) <- sprintf("protein%i",1:100) +#' +#' myList <- list(rna = xpr, proteomic = prot, pheno = pheno) +#' +#' MAE <- convertToMAE(myList) +#' @export +convertToMAE <- function(dataList) { + + # Check input data: + if (class(dataList) != "list") { + stop("dataList must be a list. \n") + } + if (is.null(dataList$pheno)) { + stop("dataList must have key-value pair labelled pheno.\n") + } + if (length(dataList) == 1) { + stop("dataList must have assay data to incorporate into a + MultiAssayExperiment object") + } + + # Note that a MultiAssayExperiment object requires an ExperimentList and + # colData (sampleMap optional if each assay uses the same colnames) + + # Possible elements for ExperimentList: + # - base::matrix (gene expression, microRNA, metabolomics, microbiome data) + # - SummarizedExperiment::SummarizedExperiment (same as matrix, but capable + # of storing additional assay-level metadata) + # - Biobase::ExpressionSet (legacy representation, use SummarizedExperiment) + # - SummarizedExperiment::RangedSummarizedExperiment (range-based datasets; + # gene expression, methylation, data types that refer to genomic positions) + # - RaggedExperiment::RaggedExperiment (range-based datasets; copy number and + # mutation data, measurements by genomic positions) + + # Assumes that pheno is a DataFrame (or coerceable to be a DataFrame) + patientPheno <- dataList$pheno + + # Generate ExperimentList from input dataList + tmp <- NULL + track <- c() + datType <- names(dataList) + for (k in 1:length(dataList)) { + # For key-value pairs that aren't labelled pheno, transform into + # objects compatible with input into MultiAssayExperiment object + if (names(dataList[k]) != "pheno") { + + # Remove duplicated columns (we keep the first column) in the assay data + if (sum(duplicated(colnames(dataList[[k]]))) != 0) { + dataList[[k]] <- dataList[[k]][,!duplicated(colnames(dataList[[k]]))] + } + + # Assumes that data is of matrix class + # *(maybe implement matrix conversion into SummarizedExperiment in future) + track <- c(track, k) + tmp <- c(tmp, list(dataList[[k]])) + } + } + names(tmp) <- datType[track] + + MAE <- MultiAssayExperiment(experiments = tmp, colData = patientPheno) + + return(MAE) +} \ No newline at end of file diff --git a/R/createPSN_MultiData.R b/R/createPSN_MultiData.R index ea6e976c..719497cd 100644 --- a/R/createPSN_MultiData.R +++ b/R/createPSN_MultiData.R @@ -12,11 +12,15 @@ #' with internally-generated identifiers. #' @param netDir (char) path to directory where networks will be stored #' @param filterSet (char) vector of networks to include -#' @param customFunc (function) custom user-function to create PSN. +#' @param makeNetFunc (function) custom user-function to create PSN. #' Must take dataList,groupList,netDir as parameters. Must #' check if a given groupList is empty (no networks to create) before #' the makePSN call for it. This is to avoid trying to make nets for datatypes #' that did not pass feature selection +#' @param sims (list) Similarity metric settings for patient data. +#' Keys must be identical to those of groupList. +#' Values are either of type character, used for built-in similarity functions, +#' or are functions, when a custom function is provided. #' @param verbose (logical) print messages #' @param ... other parameters to makePSN_NamedMatrix() or makePSN_RangedSets() #' @return (char) vector of network names. Side effect of creating the nets @@ -95,24 +99,24 @@ #' pheno_id <- setupFeatureDB(colData(brca),netDir) #' createPSN_MultiData(dataList=datList2,groupList=groupList, #' pheno=pheno_id, -#' netDir=netDir,customFunc=makeNets,numCores=1) +#' netDir=netDir,makeNetFunc=makeNets,numCores=1) #' @export createPSN_MultiData <- function(dataList, groupList, pheno, netDir=tempdir(), filterSet = NULL, - verbose = TRUE, customFunc, ...) { + verbose = TRUE, makeNetFunc=NULL, sims=NULL, ...) { if (missing(dataList)) stop("dataList must be supplied.\n") if (missing(groupList)) stop("groupList must be supplied.\n") - + # resolve user-provided IDs with internal IDs dataList <- lapply(dataList, function(x) { midx <- match(colnames(x), pheno$ID) colnames(x) <- pheno$INTERNAL_ID[midx] x }) - + if (!is.null(filterSet)) { if (length(filterSet) < 1) { s1 <- "filterSet is empty." @@ -120,8 +124,8 @@ createPSN_MultiData <- function(dataList, groupList, pheno, netDir=tempdir(), stop(paste(s1, s2, sep = " ")) } } - if (missing(customFunc)) - stop("customFunc must be suppled.\n") + + # Filter for nets (potentially feature-selected ones) if (!is.null(filterSet)) { @@ -139,12 +143,22 @@ createPSN_MultiData <- function(dataList, groupList, pheno, netDir=tempdir(), } } groupList <- groupList2 + sims <- sims[which(names(sims) %in% names(groupList))] rm(groupList2) } + if (!is.null(makeNetFunc)){ # call user-defined function for making PSN - netList <- customFunc(dataList = dataList, groupList = groupList, + netList <- makeNetFunc(dataList = dataList, groupList = groupList, netDir = netDir, ...) + } else { + netList <- createNetFuncFromSimList(dataList=dataList, + groupList = groupList, + netDir = netDir, + sims = sims, + ... + ) + } if (length(netList) < 1) stop("\n\nNo features created! Filters may be too stringent.\n") diff --git a/R/fileCache.R b/R/fileCache.R index d56c15f9..187afb11 100644 --- a/R/fileCache.R +++ b/R/fileCache.R @@ -19,16 +19,25 @@ #' @export getGMjar_path <- function(verbose = FALSE) { - java_ver <- suppressWarnings( + java_ver <- suppressMessages(suppressWarnings( system2("java", args="--version",stdout=TRUE,stderr=NULL) - ) - if (any(grep(" 11",java_ver)) || any(grep(" 12",java_ver)) || any(grep(" 13",java_ver)) || any(grep(" 14",java_ver)) || any(grep(" 16",java_ver))) { + )) + if (any(grep(" 11", java_ver)) || + any(grep(" 12", java_ver)) || + any(grep(" 13", java_ver)) || + any(grep(" 14", java_ver)) || + any(grep(" 16", java_ver)) || + any(grep(" 17", java_ver)) || + any(grep(" 18", java_ver)) || + any(grep(" 19", java_ver)) || + any(grep(" 20", java_ver)) + ) { if (verbose) message("Java 11+ detected") - fileURL <- paste("https://download.baderlab.org/netDx/java11/", + fileURL <- paste("https://downloads.res.oicr.on.ca/pailab/netDx/java11/", "genemania-netdx.jar",sep="") } else { if (verbose) message("Java 8 detected") - fileURL <- paste("https://download.baderlab.org/netDx/java8/", + fileURL <- paste("https://downloads.res.oicr.on.ca/pailab/netDx/java8/", "genemania-netdx.jar",sep="") } @@ -45,30 +54,34 @@ getGMjar_path <- function(verbose = FALSE) { #' For details see Merico D, Isserlin R, Stueker O, Emili A and GD Bader. #' (2010). PLoS One. 5(11):e13984. #' @param verbose (logical) print messages -#' @examples fetchPathwayDefinitions("October",2020) +#' @examples fetchPathwayDefinitions("October",2021) #' @param day (integer) #' @param month (numeric or char) month of pathway definition file. Can be #' numeric or text (e.g. "January","April"). If NULL, fails. #' @param year (numeric) year of pathway definition file. Must be in -#' yyyy format (e.g. 2018). If NULL, fails. +#' yyyy format (e.g. 2020). If NULL, fails. #' @return (char) Path to local cached copy of GMT file #' or initial download is required #' @importFrom httr HEAD #' @export #' @examples -#' fetchPathwayDefinitions("January",2018) -#' fetchPathwayDefinitions(month=10,year=2020) +#' fetchPathwayDefinitions("October",2021) +#' fetchPathwayDefinitions(month=10,year=2021) fetchPathwayDefinitions <- function(month=NULL,year=NULL,day=1,verbose=FALSE){ if (is.null(month) || is.null(year)) { stop("Please provide a month and year.") #month <- month.name[as.integer(format(Sys.Date(),"%m"))] #year <- as.integer(format(Sys.Date(),"%Y")) } + if (year < 2020) { + stop("Currently, year must equal 2020 or greater.") + } + if (class(month) %in% c("numeric","integer")) { month <- month.name[month] } pdate <- sprintf("%s_%02d_%i",month,day,year) - pathwayURL <- paste("https://download.baderlab.org/EM_Genesets/", + pathwayURL <- paste("https://downloads.res.oicr.on.ca/pailab/public/EM_Genesets/", sprintf("%s/Human/symbol/",pdate), sprintf("Human_AllPathways_%s_symbol.gmt",pdate), sep = "") @@ -79,7 +92,7 @@ fetchPathwayDefinitions <- function(month=NULL,year=NULL,day=1,verbose=FALSE){ if (chk$status_code==404) { stop(paste(sprintf("The pathway file for %02d %s %i doesn't exist.",day,month,year), "Select a different date. ", - "See https://download.baderlab.org/EM_Genesets/Human/symbol for options.", + "See https://downloads.res.oicr.on.ca/pailab/public/EM_Genesets/Human/symbol for options.", sep=" ")) } bfcrpath(bfc, pathwayURL) diff --git a/R/getSimilarity.R b/R/getSimilarity.R index b104b34c..b2c8f941 100644 --- a/R/getSimilarity.R +++ b/R/getSimilarity.R @@ -18,6 +18,9 @@ #' @importFrom stats cor #' @export getSimilarity <- function(x, type = "pearson", customFunc, ...) { - switch(type, pearson = round(cor(na.omit(x), method = "pearson"), - digits = 3), custom = customFunc(x, ...)) + switch(type, + pearson = round(cor(na.omit(x), method = "pearson"), + digits = 3), + custom = customFunc(x, ...) + ) } diff --git a/R/helper.R b/R/helper.R index e5e0bdec..0faa4697 100755 --- a/R/helper.R +++ b/R/helper.R @@ -12,6 +12,7 @@ #' @param featureSelPct (numeric between 0 and 1) cutoff percent for feature selection. #' A feature must have minimum score of featureSelCutoff for featureSelPct of #' train/test splits to pass. +#' @param drawPerformancePlot (logical) if TRUE, draws AUROC and AUPR plots. Set to FALSE to suppress graphical output. #' @returns list of results. #' - selectedFeatures (list of character vectors): list, one per class #' - performance (list of mixed datatypes) including mean accuracy (meanAccuracy), @@ -24,65 +25,67 @@ #' getResults(toymodel,patlabels,2,0.5) #' #' @export -getResults <- function(res, status, featureSelCutoff=1L, - featureSelPct=0){ - -numSplits <- length(grep("^Split",names(res))) -st <- status -message(sprintf("Detected %i splits and %i classes", numSplits, length(st))) - -acc <- c() # accuracy -predList <- list() # prediction tables -featScores <- list() # feature scores per class -for (cur in unique(st)) featScores[[cur]] <- list() - -# collect accuracy and feature scores -for (k in 1:numSplits) { - pred <- res[[sprintf("Split%i",k)]][["predictions"]]; - # predictions table - tmp <- pred[,c("ID","STATUS","TT_STATUS","PRED_CLASS", - sprintf("%s_SCORE",st))] - predList[[k]] <- tmp - # accuracy - acc <- c(acc, sum(tmp$PRED==tmp$STATUS)/nrow(tmp)) - # feature scores - for (cur in unique(st)) { - tmp <- res[[sprintf("Split%i",k)]][["featureScores"]][[cur]] - colnames(tmp) <- c("PATHWAY_NAME","SCORE") - featScores[[cur]][[sprintf("Split%i",k)]] <- tmp - } -} +getResults <- function(res, status, featureSelCutoff = 1L, + featureSelPct = 0, drawPerformancePlot = TRUE) { -# only plot ROC and PR curves -auroc <- NULL; aupr <- NULL -if (length(st)==2) { -message("* Plotting performance") -predPerf <- plotPerf(predList, predClasses=st) -auroc <- unlist(lapply(predPerf, function(x) x$auroc)) -aupr <- unlist(lapply(predPerf, function(x) x$aupr)) -} + numSplits <- length(grep("^Split", names(res))) + st <- status + message(sprintf("Detected %i splits and %i classes", numSplits, length(st))) -message("* Compiling feature scores and calling selected features") -feats <- callOverallSelectedFeatures(featScores, + acc <- c() # accuracy + predList <- list() # prediction tables + featScores <- list() # feature scores per class + for (cur in unique(st)) + featScores[[cur]] <- list() + + # collect accuracy and feature scores + for (k in 1:numSplits) { + pred <- res[[sprintf("Split%i", k)]][["predictions"]]; + # predictions table + tmp <- pred[, c("ID", "STATUS", "TT_STATUS", "PRED_CLASS", + sprintf("%s_SCORE", st))] + predList[[k]] <- tmp + # accuracy + acc <- c(acc, sum(tmp$PRED == tmp$STATUS) / nrow(tmp)) + # feature scores + for (cur in unique(st)) { + tmp <- res[[sprintf("Split%i", k)]][["featureScores"]][[cur]] + colnames(tmp) <- c("PATHWAY_NAME", "SCORE") + featScores[[cur]][[sprintf("Split%i", k)]] <- tmp + } + } + + # only plot ROC and PR curves + auroc <- NULL; + aupr <- NULL + if (length(st) == 2) { + if (drawPerformancePlot) message("* Plotting performance") + predPerf <- plotPerf(predList, predClasses = st, drawPlot = drawPerformancePlot) + auroc <- unlist(lapply(predPerf, function(x) x$auroc)) + aupr <- unlist(lapply(predPerf, function(x) x$aupr)) + } + + message("* Compiling feature scores and calling selected features") + feats <- callOverallSelectedFeatures(featScores, featureSelCutoff = featureSelCutoff, featureSelPct = featureSelPct, cleanNames = TRUE -) - -#### Enrichment map -###if (!is.null(pathwayList)){ -### message("* Pathway List detected - creating input for EnrichmentMap") -### browser() -###} - -return(list( - selectedFeatures=feats$selectedFeatures, - featureScores=feats$featScores, - performance=list(meanAccuracy=mean(acc), - splitAccuracy=acc, - splitAUROC=auroc, - splitAUPR=aupr) -)) + ) + + #### Enrichment map + ###if (!is.null(pathwayList)){ + ### message("* Pathway List detected - creating input for EnrichmentMap") + ### browser() + ###} + + return(list( + selectedFeatures = feats$selectedFeatures, + featureScores = feats$featScores, + performance = list(meanAccuracy = mean(acc), + splitAccuracy = acc, + splitAUROC = auroc, + splitAUPR = aupr) + )) } @@ -125,30 +128,32 @@ return(list( #' featScores: (matrix) feature scores for each split #' selectedFeatures: (list) features passing selection for each class; one key per class #' @export -callOverallSelectedFeatures <- function(featScores, featureSelCutoff, - featureSelPct, cleanNames=TRUE){ -featScores2 <- lapply(featScores, getNetConsensus) -if (cleanNames) { - featScores2 <- lapply(featScores2,function(x){ - x$PATHWAY_NAME <- sub(".profile","",x$PATHWAY_NAME) - x$PATHWAY_NAME <- sub("_cont.txt","",x$PATHWAY_NAME) - colnames(x)[1] <- "Feature" - x +callOverallSelectedFeatures <- function(featScores, featureSelCutoff, + featureSelPct, cleanNames = TRUE) { + featScores2 <- lapply(featScores, getNetConsensus) + if (cleanNames) { + featScores2 <- lapply(featScores2, function(x) { + x$PATHWAY_NAME <- sub(".profile", "", x$PATHWAY_NAME) + x$PATHWAY_NAME <- sub("_cont.txt", "", x$PATHWAY_NAME) + colnames(x)[1] <- "Feature" + x }) -} -featSelNet <- lapply(featScores2, function(x) { - x <- callFeatSel(x, fsCutoff=featureSelCutoff, fsPctPass=featureSelPct) -}) - -return(list( - featScores=featScores2, - selectedFeatures=featSelNet -)) + } + featSelNet <- lapply(featScores2, function(x) { + x <- callFeatSel(x, fsCutoff = featureSelCutoff, fsPctPass = featureSelPct) + }) + + return(list( + featScores = featScores2, + selectedFeatures = featSelNet + )) } #' Wrapper to create input files for Enrichment Map #' -#' @details An Enrichment Map is a network-based visualization of top-scoring pathway features +#' @details Creates the input to visualize selected features and their relationships +#' as a network in Cytoscape. The type of visualization is called an Enrichment Map. +#' An Enrichment Map is a network-based visualization of top-scoring pathway features #' and themes. It is generated in Cytoscape. This script generates the input files needed #' for Cytoscape to create an Enrichment Map visualization. #' @param model (list) Output of training model, generated by running buildPredictor() @@ -159,49 +164,51 @@ return(list( #' @param EMapPctPass (numeric between 0 and 1) percent of splits for which feature must have score in range #' [EMapMinScore,EMapMaxScore] to be included for EnrichmentMap visualization #' @param outDir (char) directory where files should be written -#' @return -#' @export -makeInputForEnrichmentMap <- function(model,results,pathwayList, - EMapMinScore=0L, EMapMaxScore=1L, - EMapPctPass=0.5,outDir) -{ - featScores <- results$featureScores - -message("* Creating input files for EnrichmentMap") -Emap_res <- getEMapInput_many(featScores, +#' @return (list) 1) GMTfiles (char): GMT files used to create EnrichmentMap in Cytoscape. +#' 2) NodeStyles (char): .txt files used to assign node attributes in Cytoscape. Importantly, +#' attributes include node fill, which indicates the highest consistent score for a given +#' feature. +#' @export +createInputForFeatureNetworkView <- function(model, results, pathwayList, + EMapMinScore = 0L, EMapMaxScore = 1L, + EMapPctPass = 0.5, outDir) { + featScores <- results$featureScores + + message("* Creating input files for feature network view") + Emap_res <- getEMapInput_many(featScores, pathwayList, - minScore=EMapMinScore, - maxScore=EMapMaxScore, - pctPass=EMapPctPass, + minScore = EMapMinScore, + maxScore = EMapMaxScore, + pctPass = EMapPctPass, model$inputNets, - verbose=FALSE -) + verbose = FALSE + ) -gmtFiles <- list() -nodeAttrFiles <- list() + gmtFiles <- list() + nodeAttrFiles <- list() -message("* Writing files for network visualization") -for (g in names(Emap_res)) { - outFile <- paste(outDir,sprintf("%s_nodeAttrs.txt",g),sep=getFileSep()) - write.table(Emap_res[[g]][["nodeAttrs"]],file=outFile, - sep="\t",col.names=TRUE,row.names=FALSE,quote=FALSE) + message("* Writing files for network visualization") + for (g in names(Emap_res)) { + outFile <- paste(outDir, sprintf("%s_nodeAttrs.txt", g), sep = getFileSep()) + write.table(Emap_res[[g]][["nodeAttrs"]], file = outFile, + sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) nodeAttrFiles[[g]] <- outFile - outFile <- paste(outDir,sprintf("%s.gmt",g),sep=getFileSep()) + outFile <- paste(outDir, sprintf("%s.gmt", g), sep = getFileSep()) conn <- suppressWarnings( - suppressMessages(base::file(outFile,"w"))) + suppressMessages(base::file(outFile, "w"))) tmp <- Emap_res[[g]][["featureSets"]] gmtFiles[[g]] <- outFile for (cur in names(tmp)) { - curr <- sprintf("%s\t%s\t%s", cur,cur, - paste(tmp[[cur]],collapse="\t")) - writeLines(curr,con=conn) + curr <- sprintf("%s\t%s\t%s", cur, cur, + paste(tmp[[cur]], collapse = "\t")) + writeLines(curr, con = conn) } -close(conn) -} + close(conn) + } -return(list(GMTfiles=gmtFiles,NodeStyles=nodeAttrFiles)) + return(list(GMTfiles = gmtFiles, NodeStyles = nodeAttrFiles)) } #' get the integrated patient similarity network made of selected features @@ -214,8 +221,10 @@ return(list(GMTfiles=gmtFiles,NodeStyles=nodeAttrFiles)) #' same class, relative to those of other classes, using Dijkstra distance (calcShortestPath flag). #' @param dat (MultiAssayExperiment) input data #' @param groupList (list) feature groups, identical to groupList provided for buildPredictor() -#' @param makeNets (function) Function used to create patient similarity networks. Identical to +#' @param makeNetFunc (function) Function used to create patient similarity networks. Identical to #' makeNets provided to buildPredictor() +#' @param sims (list) rules for creating PSN. Preferred over makeNetFunc. See buildPredictor() +#' for details. #' @param selectedFeatures (list) selected features for each class (key of list). This object is returned as #' part of a call to getResults(), after running buildPredictor(). #' @param plotCytoscape (logical) If TRUE, plots network in Cytoscape. @@ -245,35 +254,50 @@ return(list(GMTfiles=gmtFiles,NodeStyles=nodeAttrFiles)) #' colours (colour) #' 6) outDir (char) value of outDir parameter #' @export -getPSN <- function(dat, groupList, makeNets, selectedFeatures, plotCytoscape=FALSE, - aggFun="MEAN", prune_pctX=0.30, prune_useTop=TRUE,numCores=1L,calcShortestPath=FALSE - ){ -topPath <- gsub(".profile","", unique(unlist(selectedFeatures))) -topPath <- gsub("_cont.txt","",topPath) - -## create groupList limited to top features -g2 <- list(); -for (nm in names(groupList)) { - cur <- groupList[[nm]] - idx <- which(names(cur) %in% topPath) - message(sprintf("%s: %i features", nm, length(idx))) - if (length(idx)>0) g2[[nm]] <- cur[idx] -} +getPSN <- function(dat, groupList, + makeNetFunc = NULL, sims = NULL, + selectedFeatures, plotCytoscape = FALSE, + aggFun = "MEAN", prune_pctX = 0.30, prune_useTop = TRUE, + numCores = 1L, calcShortestPath = FALSE + ) { -message("* Making integrated PSN") -psn <- - plotIntegratedPatientNetwork(dat, - groupList=g2, makeNetFunc=makeNets, - aggFun=aggFun, - prune_pctX=prune_pctX, - prune_useTop=prune_useTop, - numCores=numCores, - calcShortestPath=calcShortestPath, - showStats=FALSE, - verbose=TRUE, - plotCytoscape=plotCytoscape) - -return(psn) + + # checks either/or provided, sets missing var to NULL + x <- checkMakeNetFuncSims(makeNetFunc = makeNetFunc, + sims = sims, groupList = groupList) + + topPath <- gsub(".profile", "", unique(unlist(selectedFeatures))) + topPath <- gsub("_cont.txt", "", topPath) + + ## create groupList limited to top features + g2 <- list() + s2 <- list() + for (nm in names(groupList)) { + cur <- groupList[[nm]] + idx <- which(names(cur) %in% topPath) + message(sprintf("%s: %i features", nm, length(idx))) + if (length(idx) > 0) { + g2[[nm]] <- cur[idx] + s2[[nm]] <- sims[[nm]] + } + } + + message("* Making integrated PSN") + psn <- + plotIntegratedPatientNetwork( + dataList = dat, + groupList = g2, makeNetFunc = makeNetFunc, + sims = s2, + aggFun = aggFun, + prune_pctX = prune_pctX, + prune_useTop = prune_useTop, + numCores = numCores, + calcShortestPath = calcShortestPath, + showStats = FALSE, + verbose = TRUE, + plotCytoscape = plotCytoscape) + + return(psn) } #' Make confusion matrix @@ -291,37 +315,38 @@ return(psn) #' @importFrom plotrix color2D.matplot #' @export confusionMatrix <- function(model) { - nmList <- names(model)[grep("Split",names(model))] - cl <- sort(unique(model$Split1$STATUS)) - conf <- list() - mega <- NULL - for (nm in nmList){ - pred <- model[[nm]][["predictions"]][,c("ID","STATUS","TT_STATUS","PRED_CLASS")] - m <- as.matrix(table(pred[,c("STATUS","PRED_CLASS")])) - conf[[nm]] <- m/colSums(m) - if (is.null(mega)) mega <- conf[[nm]] else mega <- mega + conf[[nm]] - } - - mega <- mega / length(conf) # average - mega <- round(mega*100,2) - mega <- t(mega) - metric <- "%% Accuracy" - - tbl <- table(model$Split1$predictions$STATUS) - nm <- names(tbl); val <- as.integer(tbl) - ttl <- sprintf("%s\n(N=%i)",rownames(mega),val[match(rownames(mega),nm)]) - - par(mar=c(4,8,2,2)) - color2D.matplot(mega,show.values=TRUE, border="white", - #cs1=c(1,1,1),cs2=c(1,0.5,0),cs3=c(0,0.5,0), - extremes=c(1,2), - axes=FALSE, - xlab="Predicted class",ylab="") - axis(1,at=seq_len(ncol(mega))-0.5,labels=colnames(mega)) - axis(2,at=seq_len(ncol(mega))-0.5,labels=rev(ttl),las=2) - title(sprintf("Confusion matrix: Accuracy (avg of %i splits)",length(conf))) - - return(list(splitWiseConfMatrix=conf, average=mega)) + nmList <- names(model)[grep("Split", names(model))] + cl <- sort(unique(model$Split1$STATUS)) + conf <- list() + mega <- NULL + for (nm in nmList) { + pred <- model[[nm]][["predictions"]][, c("ID", "STATUS", "TT_STATUS", "PRED_CLASS")] + m <- as.matrix(table(pred[, c("STATUS", "PRED_CLASS")])) + conf[[nm]] <- m / colSums(m) + if (is.null(mega)) mega <- conf[[nm]] else mega <- mega + conf[[nm]] + } + + mega <- mega / length(conf) # average + mega <- round(mega * 100, 2) + mega <- t(mega) + metric <- "%% Accuracy" + + tbl <- table(model$Split1$predictions$STATUS) + nm <- names(tbl) + val <- as.integer(tbl) + ttl <- sprintf("%s\n(N=%i)", rownames(mega), val[match(rownames(mega), nm)]) + + par(mar = c(4, 8, 2, 2)) + color2D.matplot(mega, show.values = TRUE, border = "white", + #cs1=c(1,1,1),cs2=c(1,0.5,0),cs3=c(0,0.5,0), + extremes = c(1, 2), + axes = FALSE, + xlab = "Predicted class", ylab = "") + axis(1, at = seq_len(ncol(mega)) - 0.5, labels = colnames(mega)) + axis(2, at = seq_len(ncol(mega)) - 0.5, labels = rev(ttl), las = 2) + title(sprintf("Confusion matrix: Accuracy (avg of %i splits)", length(conf))) + + return(list(splitWiseConfMatrix = conf, average = mega)) } #' Plot tSNE @@ -331,7 +356,7 @@ confusionMatrix <- function(model) { #' matrix (symmetric). Row and column names are patient IDs. Note that NA #' values will be replaced by very small number (effectively zero). #' @param pheno (data.frame) Patient labels. ID column is patient ID and -#' STATUS is patient label of interest. tSNE will colour-code nodes by +#' STATUS is patient label xof interest. tSNE will colour-code nodes by #' patient label. #' @param ... Parameters for Rtsne() function. #' @return (Rtsne) output of Rtsne call. Side effect of tSNE plot @@ -348,31 +373,31 @@ confusionMatrix <- function(model) { #' pheno <- data.frame(ID=pid,STATUS=c(rep("control",50),rep("case",50))) #' tSNEPlotter(psn2,pheno) #' @export -tSNEPlotter <- function(psn,pheno,...) { - -message("* Making symmetric matrix") -symmForm <- suppressMessages(makeSymmetric(psn)) -symmForm[which(is.na(symmForm))] <- .Machine$double.eps -message("* Running tSNE") -x <- Rtsne(symmForm,...) -dat <- x$Y -samps <- rownames(symmForm) -idx <- match(samps, pheno$ID) -if (all.equal(pheno$ID[idx],samps)!=TRUE) { - stop("pheno IDs not matching psn rownames") -} -st <- pheno$STATUS[idx] +tSNEPlotter <- function(psn, pheno, ...) { + + message("* Making symmetric matrix") + symmForm <- suppressMessages(makeSymmetric(psn)) + symmForm[which(is.na(symmForm))] <- .Machine$double.eps + message("* Running tSNE") + x <- Rtsne(symmForm, ...) + dat <- x$Y + samps <- rownames(symmForm) + idx <- match(samps, pheno$ID) + if (all.equal(pheno$ID[idx], samps) != TRUE) { + stop("pheno IDs not matching psn rownames") + } + st <- pheno$STATUS[idx] -# to eliminate the "no visible binding for global variable" problem -y <- status <- NULL + # to eliminate the "no visible binding for global variable" problem + y <- status <- NULL -message("* Plotting") -colnames(dat) <- c("x","y") -dat <- as.data.frame(dat,stringsAsFactors=TRUE) -dat$status <- as.factor(st) -p <- ggplot2::ggplot(dat,aes(x,y)) + geom_point(aes(colour=status)) -p <- p + xlab("") + ylab("") + ggtitle("Integrated PSN - tSNE") -print(p) + message("* Plotting") + colnames(dat) <- c("x", "y") + dat <- as.data.frame(dat, stringsAsFactors = TRUE) + dat$status <- as.factor(st) + p <- ggplot2::ggplot(dat, aes(x, y)) + geom_point(aes(colour = status)) + p <- p + xlab("") + ylab("") + ggtitle("Integrated PSN - tSNE") + print(p) -return(x) + return(x) } \ No newline at end of file diff --git a/R/makePSN_NamedMatrix.R b/R/makePSN_NamedMatrix.R index 97711c37..18fbe832 100644 --- a/R/makePSN_NamedMatrix.R +++ b/R/makePSN_NamedMatrix.R @@ -101,28 +101,31 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir = tempdir(), if (verbose) message(sprintf("%i members", length(idx))) - oFile <- NULL + oFile <- NULL + # has sufficient connections to make network if (length(idx) >= minMembers) { if (writeProfiles) { - outFile <- paste(outDir,sprintf("%s.profile",curSet), - sep=getFileSep()) - write.table(t(xpr[idx, , drop = FALSE]), file = outFile, - sep = "\t",dec=".", - col.names = FALSE, row.names = TRUE, quote = FALSE) + outFile <- paste(outDir, sprintf("%s.profile", curSet), + sep=getFileSep()) + write.table(t(xpr[idx,, drop = FALSE]), file = outFile, + sep = "\t", dec = ".", + col.names = FALSE, row.names = TRUE, quote = FALSE) + } else { - outFile <- paste(outDir,sprintf("%s_cont.txt", curSet), - sep=getFileSep()) + outFile <- paste(outDir, sprintf("%s_cont.txt", curSet), + sep=getFileSep()) message(sprintf("computing sim for %s", curSet)) - sim <- getSimilarity(xpr[idx, , drop = FALSE], - type = simMetric, - ...) + sim <- getSimilarity(xpr[idx,, drop = FALSE], + type = simMetric, + ...) if (is.null(sim)) { - stop(sprintf(paste("makePSN_NamedMatrix:%s: ", - "similarity matrix is empty (NULL).\n", - "Check that there isn't a mistake in the ", - "input data or similarity method of choice.\n", - sep = ""), curSet)) + stop(sprintf(paste("makePSN_NamedMatrix:%s: ", + "similarity matrix is empty (NULL).\n", + "Check that there isn't a mistake in the ", + "input data or similarity method of choice.\n", + sep = ""), + curSet)) } pat_pairs <- sim @@ -149,8 +152,8 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir = tempdir(), }) } } else { - write.table(pat_pairs, file = outFile, sep = "\t", - col.names = FALSE, + write.table(pat_pairs, file = outFile, sep = "\t", + col.names = FALSE, row.names = FALSE, quote = FALSE) print(basename(outFile)) message("done") diff --git a/R/plotEmap.R b/R/plotEmap.R index 2e1ff6bc..450a68f8 100644 --- a/R/plotEmap.R +++ b/R/plotEmap.R @@ -55,98 +55,109 @@ #' nodeAttrFile <- EMap_input[[1]][2] #' #' # not run because requires Cytoscape to be installed and open -#' # plotEmap(gmtFile = gmtFile, nodeAttrFile = nodeAttrFile, +#' # viewSelectedFeaturesAsNetworks(gmtFile = gmtFile, nodeAttrFile = nodeAttrFile, #' #\t\tnetName='HighRisk') -#' @return No value. Side effect of plotting the EnrichmentMap in an open +#' @return No value. Side effect of plotting the network view for features in an open #' session of Cytoscape. #' @export -plotEmap <- function(gmtFile, nodeAttrFile, netName = "generic", - scoreCol="maxScore", - minScore = 1, maxScore = 10, nodeFillStops=c(7,9), - colorScheme = "cont_heatmap", imageFormat = "png", verbose = FALSE, - createStyle = TRUE, - groupClusters = FALSE, hideNodeLabels=FALSE) { +viewSelectedFeaturesAsNetworks <- function(gmtFile, nodeAttrFile, netName = "generic", + scoreCol = "maxScore", + minScore = 1, maxScore = 10, nodeFillStops = c(7, 9), + colorScheme = "cont_heatmap", imageFormat = "png", verbose = FALSE, + createStyle = TRUE, + groupClusters = FALSE, hideNodeLabels = FALSE) { - if (!requireNamespace("RCy3",quietly=TRUE)) { - stop("Package \"RCy3\" needed for plotEmap() to work. Please install it and then make your call.", - call.=FALSE) - } - - validColSchemes <- c("cont_heatmap", "netDx_ms") - if (!colorScheme %in% validColSchemes) { - stop(sprintf("colorScheme should be one of { %s }\n", - paste(validColSchemes, + if (!requireNamespace("RCy3", quietly = TRUE)) { + stop("Package \"RCy3\" needed for plotEmap() to work. Please install it and then make your call.", + call. = FALSE) + } + + tryCatch({ + RCy3::cytoscapePing() + }, error = function(ex) { + stop("Error while trying to ping Cytoscape. Are you sure you have Cytoscape installed and currently running?") + }, finally = { + }) + + validColSchemes <- c("cont_heatmap", "netDx_ms") + if (!colorScheme %in% validColSchemes) { + stop(sprintf("colorScheme should be one of { %s }\n", + paste(validColSchemes, collapse = ","))) - } - - ####################################### create EM using given parameters - if (netName %in% getNetworkList()) { - RCy3::deleteNetwork(netName) - } - em_command <- paste("enrichmentmap build analysisType=\"generic\"", - "gmtFile=", gmtFile, "pvalue=", 1, "qvalue=", 1, - "similaritycutoff=", 0.05, "coefficients=", "JACCARD") - response <- RCy3::commandsGET(em_command) - renameNetwork(netName, getNetworkSuid()) - - ### #annotate the network using AutoAnnotate app - aa_command <- paste("autoannotate annotate-clusterBoosted", - "clusterAlgorithm=MCL", + } + + ####################################### create EM using given parameters + if (netName %in% RCy3::getNetworkList()) { + RCy3::deleteNetwork(netName) + } + + em_command <- paste("enrichmentmap build analysisType=\"generic\"", + "gmtFile=", gmtFile, "pvalue=", 1, "qvalue=", 1, + "similaritycutoff=", 0.05, "coefficients=", "JACCARD") + response <- RCy3::commandsGET(em_command) + RCy3::renameNetwork(netName, RCy3::getNetworkSuid()) + + ### #annotate the network using AutoAnnotate app + aa_command <- paste("autoannotate annotate-clusterBoosted", + "clusterAlgorithm=MCL", "labelColumn=name", "maxWords=3", "network=", netName) - print(aa_command) - response <- RCy3::commandsGET(aa_command) - - message("* Importing node attributes\n") - table_command <- sprintf(paste("table import file file=%s ", - "keyColumnIndex=1 ", - "firstRowAsColumnNames=true startLoadRow=1 TargetNetworkList=%s ", - "WhereImportTable=To%%20selected%%20networks%%20only", - sep = " "), nodeAttrFile, netName) - response <- RCy3::commandsGET(table_command) - - # apply style - message("* Creating or applying style\n") - all_unique_scores_int <- sort(unique(read.delim(nodeAttrFile)[, 2])) - all_unique_scores <- unlist(lapply(all_unique_scores_int, toString)) - styleName <- "EMapStyle" - - # define colourmap - scoreVals <- minScore:maxScore - style_cols <- "" - if (colorScheme == "cont_heatmap") { - colfunc <- colorRampPalette(c("yellow", "red")) - gradient_cols <- colfunc(length(scoreVals)) - style_cols <- colfunc(length(scoreVals)) - } else if (colorScheme == "netDx_ms") { - style_cols <- rep("white", length(scoreVals)) - style_cols[which(scoreVals >= nodeFillStops[1])] <- "orange" - style_cols[which(scoreVals >= nodeFillStops[2])] <- "red" - } - nodeLabels <- RCy3::mapVisualProperty("node label", "name", "p") - nodeFills <- RCy3::mapVisualProperty("node fill color", scoreCol, "d", - scoreVals, style_cols) - defaults <- list(NODE_SHAPE = "ellipse", NODE_SIZE = 30, - EDGE_TRANSPARENCY = 200, - NODE_TRANSPARENCY = 255, EDGE_STROKE_UNSELECTED_PAINT = "#999999") - if (createStyle) { - message("Making style\n") - RCy3::createVisualStyle(styleName, defaults, list(nodeLabels, nodeFills)) - } - RCy3::setVisualStyle(styleName) - if (groupClusters) { - RCy3::layoutNetwork("attributes-layout NodeAttribute=__mclCLuster") - redraw_command <- sprintf("autoannotate redraw network=%s", - RCy3::getNetworkSuid()) - response <- RCy3::commandsGET(redraw_command) - RCy3::fitContent() - - redraw_command <- sprintf("autoannotate redraw network=%s", - RCy3::getNetworkSuid()) - response <- RCy3::commandsGET(redraw_command) - RCy3::fitContent() - } + #print(aa_command) + response <- RCy3::commandsGET(aa_command) + + message("* Importing node attributes\n") + attrs <- read.delim(nodeAttrFile, header = T, as.is = T) + RCy3::loadTableData(attrs, data.key.column = "netName") + + # apply style + message("* Creating or applying style\n") + all_unique_scores_int <- sort(unique(read.delim(nodeAttrFile)[, 2])) + all_unique_scores <- unlist(lapply(all_unique_scores_int, toString)) + styleName <- "EMapStyle" + + # define colourmap + scoreVals <- minScore:maxScore + style_cols <- "" + if (colorScheme == "cont_heatmap") { + colfunc <- colorRampPalette(c("yellow", "red")) + gradient_cols <- colfunc(length(scoreVals)) + style_cols <- colfunc(length(scoreVals)) + } else if (colorScheme == "netDx_ms") { + style_cols <- rep("white", length(scoreVals)) + style_cols[which(scoreVals >= nodeFillStops[1])] <- "orange" + style_cols[which(scoreVals >= nodeFillStops[2])] <- "red" + } + + nodeLabels <- RCy3::mapVisualProperty("node label", "name", "p") + nodeFills <- RCy3::mapVisualProperty("node fill color", scoreCol, "d", + scoreVals, style_cols) + defaults <- list( + "node shape" = "ellipse", + "node size" = 30, + "edge transparency" = 200, + "node transparency" = 255, + "edge stroke unselected paint" = "#999999" + ) + if (createStyle) { + message("\tCreating style\n") + RCy3::createVisualStyle(styleName, defaults, list(nodeLabels, nodeFills)) + } + RCy3::setVisualStyle(styleName) + if (groupClusters) { + RCy3::layoutNetwork("attributes-layout NodeAttribute=__mclCLuster") + redraw_command <- sprintf("autoannotate redraw network=%s", + RCy3::getNetworkSuid()) + response <- RCy3::commandsGET(redraw_command) + RCy3::fitContent() + + redraw_command <- sprintf("autoannotate redraw network=%s", + RCy3::getNetworkSuid()) + response <- RCy3::commandsGET(redraw_command) + RCy3::fitContent() + RCy3::fitContent() + } - if (hideNodeLabels) { - RCy3::setNodeFontSizeDefault(0,styleName) - } + if (hideNodeLabels) { + RCy3::setNodeFontSizeDefault(0, styleName) + RCy3::fitContent() + } } diff --git a/R/plotIntegratedPatientNetwork.R b/R/plotIntegratedPatientNetwork.R index cd9ed4c4..4cb86750 100644 --- a/R/plotIntegratedPatientNetwork.R +++ b/R/plotIntegratedPatientNetwork.R @@ -19,6 +19,7 @@ #' list of lists, where the outer list corresponds to assay (e.g. mRNA, #' clinical) and inner list to features to generate from that datatype. #' @param makeNetFunc (function) function to create features +#' @param sims (list) rules for creating PSN. Preferred over makeNetFunc #' @param setName (char) name to assign the network in Cytoscape #' @param numCores (integer) number of cores for parallel processing #' @param prune_pctX (numeric between 0 and 1) fraction of most/least @@ -59,7 +60,8 @@ #' @importFrom RColorBrewer brewer.pal #' @importFrom stats wilcox.test qexp density #' @export -plotIntegratedPatientNetwork <- function(dataList,groupList,makeNetFunc, +plotIntegratedPatientNetwork <- function(dataList,groupList, + makeNetFunc=NULL,sims=NULL, setName="predictor",prune_pctX=0.05, prune_useTop=TRUE, aggFun="MAX",calcShortestPath=FALSE, showStats=FALSE, @@ -67,6 +69,11 @@ plotIntegratedPatientNetwork <- function(dataList,groupList,makeNetFunc, nodeTransparency=155L,plotCytoscape=FALSE, verbose=FALSE) { + +# checks either/or provided, sets missing var to NULL +checkMakeNetFuncSims(makeNetFunc=makeNetFunc, + sims=sims,groupList=groupList) + if (missing(dataList)) stop("dataList is missing.") dat <- dataList2List(dataList, groupList) @@ -81,7 +88,9 @@ pheno_id <- setupFeatureDB(pheno,outDir) createPSN_MultiData(dataList=dat$assays,groupList=groupList, pheno=pheno_id, - netDir=outDir,customFunc=makeNetFunc,numCores=numCores, + netDir=outDir, + makeNetFunc=makeNetFunc,sims=sims, + numCores=numCores, verbose=FALSE) convertProfileToNetworks( netDir=profDir, diff --git a/R/plotPerf.R b/R/plotPerf.R index 632c1d60..3f935022 100644 --- a/R/plotPerf.R +++ b/R/plotPerf.R @@ -11,6 +11,7 @@ #' @param predClasses (char) vector of class names. #' @param plotSEM (logical) metric for error bars. If set to TRUE, plots SEM; #' else plots SD. +#' @param drawPlot (logical) If TRUE, draws AUROC and AUPR curves. #' @return (list) each key corresponds to an input file in inDir. #' Value is a list with: #' 1) stats: 'stats' component of perfCalc @@ -39,115 +40,120 @@ #' @importFrom stats sd #' @importFrom graphics abline axis par points segments text title hist #' @export -plotPerf <- function(resList=NULL, inFiles, predClasses,plotSEM=FALSE) { - if (is.null(resList)) { - if (missing(inFiles)) - stop("inDir not provided") - } - if (missing(predClasses)) - stop("predClasses missing; please specify classes") - - # given output of performance('precall') compute AUC-PR - prauc <- function(dat) { - x <- dat@x.values[[1]] # recall - y <- dat@y.values[[1]] # precision - - # remove NAN - idx <- which(is.nan(y)) - if (any(idx)) { - x <- x[-idx] - y <- y[-idx] - } - - pracma::trapz(x, y) +plotPerf <- function(resList = NULL, inFiles, predClasses, plotSEM = FALSE, drawPlot = TRUE) { + if (is.null(resList)) { + if (missing(inFiles)) + stop("inDir not provided") + } + if (missing(predClasses)) + stop("predClasses missing; please specify classes") + + # given output of performance('precall') compute AUC-PR + prauc <- function(dat) { + x <- dat@x.values[[1]] # recall + y <- dat@y.values[[1]] # precision + + # remove NAN + idx <- which(is.nan(y)) + if (any(idx)) { + x <- x[-idx] + y <- y[-idx] } - - if (is.null(resList)) { - resList <- list(); ctr <- 1 - for (fName in inFiles) { - resList[[ctr]] <- read.delim(fName, - sep = "\t", header = TRUE, as.is = TRUE) - ctr <- ctr+1 - } - } - - mega <- list() - for (ctr in seq_len(length(resList))) { - dat <- resList[[ctr]] - out <- list() - overall_acc <- numeric() - curRoc <- list() - curPr <- list() - - pred_col1 <- sprintf("%s_SCORE", predClasses[1]) - pred_col2 <- sprintf("%s_SCORE", predClasses[2]) - - idx1 <- which(colnames(dat) == pred_col1) - idx2 <- which(colnames(dat) == pred_col2) - pred <- ROCR::prediction(dat[, idx1] - dat[, idx2], - dat$STATUS == predClasses[1]) - - c1 <- predClasses[1] #numc[1] - tp <- sum(dat$STATUS == dat$PRED_CLASS & dat$STATUS == c1) - tn <- sum(dat$STATUS == dat$PRED_CLASS & dat$STATUS != c1) - fp <- sum(dat$STATUS != dat$PRED_CLASS & dat$STATUS != c1) - fn <- sum(dat$STATUS != dat$PRED_CLASS & dat$STATUS == c1) - - # entire curves - curRoc <- ROCR::performance(pred, "tpr", "fpr") - curPr <- ROCR::performance(pred, "prec", "rec") - tmp <- data.frame(score = 0, tp = tp, tn = tn, fp = fp, fn = fn) - out <- perfCalc(tmp) - - # statistic - auroc <- performance(pred, "auc")@y.values[[1]] - aupr <- prauc(curPr) - corr <- sum(dat$STATUS == dat$PRED_CLASS) - overall_acc <- c(overall_acc, corr/nrow(dat) * 100) - - ### TODO put in F1. - mega[[ctr]] <- list(stats = out$stats, roc_curve = curRoc, - pr_curve = curPr, - auroc = auroc, aupr = aupr, accuracy = overall_acc) + + pracma::trapz(x, y) + } + + if (is.null(resList)) { + resList <- list(); + ctr <- 1 + for (fName in inFiles) { + resList[[ctr]] <- read.delim(fName, + sep = "\t", header = TRUE, as.is = TRUE) + ctr <- ctr + 1 } - - .plotAvg <- function(res, name,plotSEM) { - mu <- mean(res, na.rm = TRUE) - if (plotSEM) { - err <- sd(res, na.rm = TRUE)/sqrt(length(res)) - errnm <- "SEM" - } else { - err <- sd(res, na.rm = TRUE) - errnm <- "SD" -} - plot(1, mu, type = "n", bty = "n", - ylab = sprintf("%s (mean+/-%s)", name,errnm), - xaxt = "n", ylim = c(0.4, 1), las = 1, - xlim = c(0.8,1.2), - cex.axis = 1.4, xlab = "") - abline(h = c(0.7, 0.8), col = "cadetblue3", lty = 3, lwd = 3) - points(1, mu, type = "p", cex = 1.4, pch = 16) - - # error bars - segments(x0 = 1, y0 = mu - err, y1 = mu + err, lwd = 3) - segments(x0 = 1 - 0.01, x1 = 1 + 0.01, y0 = mu - err, y1 = mu - err) - segments(x0 = 1 - 0.01, x1 = 1 + 0.01, y0 = mu + err, y1 = mu + err) - abline(h = 0.5, col = "red", lty = 1, lwd = 2) - title(sprintf("%s: N=%i runs", name, length(res))) + } + + mega <- list() + for (ctr in seq_len(length(resList))) { + dat <- resList[[ctr]] + out <- list() + overall_acc <- numeric() + curRoc <- list() + curPr <- list() + + pred_col1 <- sprintf("%s_SCORE", predClasses[1]) + pred_col2 <- sprintf("%s_SCORE", predClasses[2]) + + idx1 <- which(colnames(dat) == pred_col1) + idx2 <- which(colnames(dat) == pred_col2) + pred <- ROCR::prediction(dat[, idx1] - dat[, idx2], + dat$STATUS == predClasses[1]) + + c1 <- predClasses[1] #numc[1] + tp <- sum(dat$STATUS == dat$PRED_CLASS & dat$STATUS == c1) + tn <- sum(dat$STATUS == dat$PRED_CLASS & dat$STATUS != c1) + fp <- sum(dat$STATUS != dat$PRED_CLASS & dat$STATUS != c1) + fn <- sum(dat$STATUS != dat$PRED_CLASS & dat$STATUS == c1) + + # entire curves + curRoc <- ROCR::performance(pred, "tpr", "fpr") + curPr <- ROCR::performance(pred, "prec", "rec") + tmp <- data.frame(score = 0, tp = tp, tn = tn, fp = fp, fn = fn) + out <- perfCalc(tmp) + + # statistic + auroc <- performance(pred, "auc")@y.values[[1]] + aupr <- prauc(curPr) + corr <- sum(dat$STATUS == dat$PRED_CLASS) + overall_acc <- c(overall_acc, corr / nrow(dat) * 100) + + ### TODO put in F1. + mega[[ctr]] <- list(stats = out$stats, roc_curve = curRoc, + pr_curve = curPr, + auroc = auroc, aupr = aupr, accuracy = overall_acc) + } + + .plotAvg <- function(res, name, plotSEM) { + mu <- mean(res, na.rm = TRUE) + if (plotSEM) { + err <- sd(res, na.rm = TRUE) / sqrt(length(res)) + errnm <- "SEM" + } else { + err <- sd(res, na.rm = TRUE) + errnm <- "SD" } - - # plot average +/-error + plot(1, mu, type = "n", bty = "n", + ylab = sprintf("%s (mean+/-%s)", name, errnm), + xaxt = "n", ylim = c(0.4, 1), las = 1, + xlim = c(0.8, 1.2), + cex.axis = 1.4, xlab = "") + abline(h = c(0.7, 0.8), col = "cadetblue3", lty = 3, lwd = 3) + points(1, mu, type = "p", cex = 1.4, pch = 16) + + # error bars + segments(x0 = 1, y0 = mu - err, y1 = mu + err, lwd = 3) + segments(x0 = 1 - 0.01, x1 = 1 + 0.01, y0 = mu - err, y1 = mu - err) + segments(x0 = 1 - 0.01, x1 = 1 + 0.01, y0 = mu + err, y1 = mu + err) + abline(h = 0.5, col = "red", lty = 1, lwd = 2) + title(sprintf("%s: N=%i runs", name, length(res))) + } + + # plot average +/-error + if (drawPlot) { par(mfrow = c(2, 2)) x <- unlist(lapply(mega, function(x) x$auroc)) - .plotAvg(x, "AUROC",plotSEM) + .plotAvg(x, "AUROC", plotSEM) x <- unlist(lapply(mega, function(x) x$aupr)) - .plotAvg(x, "AUPR",plotSEM) - - # plot individual curves + .plotAvg(x, "AUPR", plotSEM) + } + + # plot individual curves + if (drawPlot) { rocCurves <- lapply(mega, function(x) x$roc_curve) plotPerf_multi(rocCurves, "ROC") prCurves <- lapply(mega, function(x) x$pr_curve) plotPerf_multi(prCurves, "PR", plotType = "PR") - - return(mega) + } + + return(mega) } diff --git a/R/predict.R b/R/predict.R index 4fd8ee0a..0df38814 100644 --- a/R/predict.R +++ b/R/predict.R @@ -5,12 +5,12 @@ #' @param testMAE (MultiAssayExperiment) new patient dataset for testing model. Assays must be the same as for trainMAE. #' @param groupList (list) list of features used to train the model. Keys are data types, and values are lists for groupings within those datatypes. #' e.g. keys could include {'clinical','rna','methylation'}, and values within 'rna' could include pathway names {'cell cycle', 'DNA repair'}, etc., -#' featSel will be used to subset -#' @param featSel (list) selected features to be used in the predictive model. +#' selectedFeatures will be used to subset +#' @param selectedFeatures (list) selected features to be used in the predictive model. #' keys are patient labels (e.g. "responder/nonresponder"), and values are feature names #' identified by running buildPredictor(). Feature names must correspond to names of groupList, from which they will be subset. #' @param makeNetFunc (function) function to create PSN features from patient data. See makeNetFunc in buildPredictor() for details -#' @param impute (logical) if TRUE imputes train and test samples separately before creating features. Currently unsupported. +#' @param sims (list) rules for creating PSN. Preferred over makeNetFunc. #' @param outDir (char) directory for results #' @param verbose (logical) print messages #' @param numCores (integer) number of CPU cores for parallel processing @@ -20,19 +20,20 @@ #' columns are: 1) ID, 2) STATUS (ground truth), 3)