diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
index a21d18af..1ffccd1b 100644
--- a/.github/workflows/check-bioc.yml
+++ b/.github/workflows/check-bioc.yml
@@ -39,7 +39,7 @@ env:
   run_pkgdown: 'false'
   has_RUnit: 'false'
   cache-version: 'cache-v1'
-  run_docker: 'true'
+  run_docker: 'false'
 
 jobs:
   build-check:
@@ -52,9 +52,9 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - { os: ubuntu-latest, r: '4.1', bioc: '3.14', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
-          - { os: macOS-latest, r: '4.1', bioc: '3.14'}
-          - { os: windows-latest, r: '4.1', bioc: '3.14'}
+          - { os: ubuntu-latest, r: '4.2', bioc: '3.16', cont: "bioconductor/bioconductor_docker:RELEASE_3_16", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
+          - { os: macOS-latest, r: '4.2', bioc: '3.16'}
+          - { os: windows-latest, r: '4.2', bioc: '3.16'}
           ## Check https://github.com/r-lib/actions/tree/master/examples
           ## for examples using the http-user-agent
     env:
@@ -79,12 +79,12 @@ jobs:
       ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml
       ## If they update their steps, we will also need to update ours.
       - name: Checkout Repository
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
 
       ## R is already included in the Bioconductor docker images
       - name: Setup R from r-lib
         if: runner.os != 'Linux'
-        uses: r-lib/actions/setup-r@master
+        uses: r-lib/actions/setup-r@v2
         with:
           r-version: ${{ matrix.config.r }}
           http-user-agent: ${{ matrix.config.http-user-agent }}
@@ -92,7 +92,7 @@ jobs:
       ## pandoc is already included in the Bioconductor docker images
       - name: Setup pandoc from r-lib
         if: runner.os != 'Linux'
-        uses: r-lib/actions/setup-pandoc@master
+        uses: r-lib/actions/setup-pandoc@v2
 
       - name: Query dependencies
         run: |
@@ -102,19 +102,19 @@ jobs:
 
       - name: Restore R package cache
         if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'"
-        uses: actions/cache@v2
+        uses: actions/cache@v3
         with:
           path: ${{ env.R_LIBS_USER }}
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.1-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.1-
+          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-
 
       - name: Cache R packages on Linux
         if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' "
-        uses: actions/cache@v2
+        uses: actions/cache@v3
         with:
           path: /home/runner/work/_temp/Library
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.1-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.1-
+          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-
 
       - name: Install Linux system dependencies
         if: runner.os == 'Linux'
@@ -176,7 +176,7 @@ jobs:
           gha_repos <- if(
               .Platform$OS.type == "unix" && Sys.info()["sysname"] != "Darwin"
           ) c(
-              "AnVIL" = "https://bioconductordocker.blob.core.windows.net/packages/3.14/bioc",
+              "AnVIL" = "https://bioconductordocker.blob.core.windows.net/packages/3.16/bioc",
               BiocManager::repositories()
               ) else BiocManager::repositories()
 
@@ -218,7 +218,7 @@ jobs:
       - name: Install pkgdown
         if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
         run: |
-          remotes::install_github("r-lib/pkgdown")
+          remotes::install_cran("pkgdown")
         shell: Rscript {0}
 
       - name: Session info
@@ -236,7 +236,7 @@ jobs:
           options(crayon.enabled = TRUE)
           rcmdcheck::rcmdcheck(
               args = c("--no-manual", "--no-vignettes", "--timings"),
-              build_args = c("--no-manual", "--keep-empty-dirs", "--resave-data"),
+              build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"),
               error_on = "warning",
               check_dir = "check"
           )
@@ -275,31 +275,47 @@ jobs:
         if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
         run: R CMD INSTALL .
 
-      - name: Build and deploy pkgdown site
+      - name: Build pkgdown site
         if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
-        run: |
-          git config --local user.name "$GITHUB_ACTOR"
-          git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
-          Rscript -e "pkgdown::deploy_to_branch(new_process = FALSE)"
-        shell: bash {0}
+        run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
+        shell: Rscript {0}
         ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE)
         ## at least one locally before this will work. This creates the gh-pages
         ## branch (erasing anything you haven't version controlled!) and
         ## makes the git history recognizable by pkgdown.
 
+      - name: Install deploy dependencies
+        if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
+        run: |
+          apt-get update && apt-get -y install rsync
+
+      - name: Deploy pkgdown site to GitHub pages 🚀
+        if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
+        uses: JamesIves/github-pages-deploy-action@releases/v4
+        with:
+          clean: false
+          branch: gh-pages
+          folder: docs
+
       - name: Upload check results
         if: failure()
         uses: actions/upload-artifact@master
         with:
-          name: ${{ runner.os }}-biocversion-devel-r-4.1-results
+          name: ${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-results
           path: check
 
-      - name: Build & push Docker image v5
-        if: runner.os == 'Linux'
-        uses: mr-smithers-excellent/docker-build-push@v5   
+        ## Note that DOCKER_PASSWORD is really a token for your dockerhub
+        ## account, not your actual dockerhub account password.
+        ## This comes from
+        ## https://seandavi.github.io/BuildABiocWorkshop/articles/HOWTO_BUILD_WORKSHOP.html#6-add-secrets-to-github-repo
+        ## Check https://github.com/docker/build-push-action/tree/releases/v1
+        ## for more details.
+      - uses: docker/build-push-action@v1
+        if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && runner.os == 'Linux' "
         with:
-          image: shraddhapai/netdx_devenv          
-          tags: latest
-          registry: docker.io
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
+          repository: realpailab/netdx
+          tag_with_ref: true
+          tag_with_sha: true
+          tags: latest
diff --git a/.github/workflows/push-docker.yml b/.github/workflows/push-docker.yml
new file mode 100644
index 00000000..7a9aed3e
--- /dev/null
+++ b/.github/workflows/push-docker.yml
@@ -0,0 +1,24 @@
+name: Docker Build
+
+# Rebuild and publish the netDx image on every push to master.
+on:
+  push:
+    branches: [master]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v3  # same major version as check-bioc.yml
+
+      - name: Build and push Docker image
+        uses: mr-smithers-excellent/docker-build-push@v5
+        with:
+          image: realpailab/netdx
+          registry: docker.io
+          addLatest: 'true'
+          addTimestamp: 'true'
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index a535e8d7..ff7e0028 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
 .Ruserdata
 doc
 Meta
+inst/doc
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 00000000..cdfe39c7
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,30 @@
+# YAML 1.2
+---
+authors: 
+  -
+    family-names: Pai
+    given-names: Shraddha
+  -
+    family-names: Shah
+    given-names: Ahmad
+  -
+    family-names: Hui
+    given-names: Shirley
+  -
+    family-names: Isserlin
+    given-names: Ruth
+  -
+    family-names: Kaka
+    given-names: Hussam
+  -
+    family-names: Bader
+    given-names: Gary
+cff-version: "1.1.0"
+date-released: 2019  # NOTE(review): CFF expects a full YYYY-MM-DD date - confirm the release date
+doi: "10.15252/msb.20188497"
+license: MIT
+message: "If you use this software, please cite it using these metadata."
+repository-code: "https://github.com/RealPaiLab/netDx"
+title: "netDx: Network-based patient classifier"
+version: "1.5.5"  # NOTE(review): DESCRIPTION in this change set declares 1.7.1 - confirm which is intended
+...
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
index b03de76d..8dfb51f6 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: netDx
 Title: Network-based patient classifier
-Version: 1.5.3
+Version: 1.7.1
 Authors@R: c(person("Shraddha", "Pai", 
 		email = "shraddha.pai@utoronto.ca", 
 		role = c("aut", "cre"),
@@ -9,6 +9,8 @@ Authors@R: c(person("Shraddha", "Pai",
 	person("Ahmad","Shah", role="aut"),
 	person("Luca","Giudice",role="aut"),
 	person("Shirley","Hui",role="aut"),
+	person("Anne","Nøhr",role="ctb"),
+	person("Indy","Ng",role="ctb"),
 	person("Ruth","Isserlin",role="aut"),
 	person("Hussam","Kaka", role="aut"),
 	person("Gary","Bader",role="aut"))
@@ -17,15 +19,11 @@ Depends:
     R (>= 3.6)
 Suggests: 
   curatedTCGAData,
-	TCGAutils,
 	rmarkdown,
 	testthat,
 	knitr,
 	BiocStyle,
-	RCy3,
-	clusterExperiment,
-	netSmooth,
-	scater
+	RCy3
 Imports: ROCR,pracma,ggplot2,glmnet,igraph,reshape2,
 	parallel,stats,utils,MultiAssayExperiment,graphics,grDevices,
 	methods,BiocFileCache,GenomicRanges,
diff --git a/NAMESPACE b/NAMESPACE
index 94091be0..e3a6ed0f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -12,9 +12,12 @@ export(compileFeatureScores)
 export(compileFeatures)
 export(confusionMatrix)
 export(convertProfileToNetworks)
+export(convertToMAE)
 export(countIntType)
 export(countIntType_batch)
 export(countPatientsInNet)
+export(createInputForFeatureNetworkView)
+export(createNetFuncFromSimList)
 export(createPSN_MultiData)
 export(dataList2List)
 export(enrichLabelNets)
@@ -34,7 +37,6 @@ export(getPerformance)
 export(getRegionOL)
 export(getResults)
 export(getSimilarity)
-export(makeInputForEnrichmentMap)
 export(makePSN_NamedMatrix)
 export(makePSN_RangeSets)
 export(makeQueries)
@@ -42,7 +44,6 @@ export(makeSymmetric)
 export(mapNamedRangesToSets)
 export(normDiff)
 export(perfCalc)
-export(plotEmap)
 export(plotIntegratedPatientNetwork)
 export(plotPerf)
 export(plotPerf_multi)
@@ -59,14 +60,14 @@ export(setupFeatureDB)
 export(sim.eucscale)
 export(sim.pearscale)
 export(simpleCap)
-export(smoothMutations_LabelProp)
 export(sparsify2)
 export(sparsify3)
 export(splitTestTrain)
 export(splitTestTrain_resampling)
+export(subsampleValidationData)
 export(tSNEPlotter)
-export(thresholdSmoothedMutations)
 export(updateNets)
+export(viewSelectedFeaturesAsNetworks)
 export(writeNetsSIF)
 export(writeQueryBatchFile)
 export(writeQueryFile)
diff --git a/NEWS b/NEWS
index 9f0ed121..5220ade7 100644
--- a/NEWS
+++ b/NEWS
@@ -1,10 +1,32 @@
-netDx 1.5.3
+3.14 RELEASE 
+==================
+
+netDx 1.7.1
+==================
+* Added vignette showing how to view top features as connected networks in Cytoscape (Enrichment Map). 
+* Improved Java support. Includes:
+  * Prompt to users to install Java if not installed, and requirement check for Java 16 or lower.
+  * Feature selection with Java 12+ resulted in an "illegal access" error due to a change in how Java handles reflection. netDx sidesteps
+  the error for Java 9-16 but cannot do so for Java 17. Java 12-16 should now work with feature selection.
+* plotPerf() now provides the option to not plot the ROC and PR curves with a drawPlot parameter.
+
+netDx 1.5.10
+==================
+* Functions that create network visualizations of top-scoring features have been renamed to better reflect their purpose.
+  * plotEmap() -> viewSelectedFeaturesAsNetworks()
+  * makeInputForEnrichmentMap() -> createInputForFeatureNetworkView()
+
+netDx 1.5.9
+==================
+* The ability to smooth somatic mutations over an interaction network (via smoothMutations_LabelProp()) has been temporarily removed. This function made software installation challenging on machines without C++ compilers, as it relies on clusterExperiment, which can only be compiled from source. This functionality will be added back when doing so does not complicate the install. It is unclear whether users want to routinely use this functionality.
+
+netDx 1.5.8
 ==================
 * Moved RCy3, scater, clusterExperiment and netSmooth to "Suggests" to reduce dependency burden
 * Sped up vignettes by limiting all to binary classification and limiting number of layers
 * Removed TL;DR from vignettes as usefulness in question but maintainance high.
 
-Developers notes:
+Developer notes:
 -------------------
 * Added Dockerfile and Github Actions for automated testing
 * GHA auto-generates a Docker image with netDx which gets pushed to shraddhapai/netdx_devenv
diff --git a/R/buildPredictor.R b/R/buildPredictor.R
index a84fcebb..e1e7886f 100644
--- a/R/buildPredictor.R
+++ b/R/buildPredictor.R
@@ -29,6 +29,9 @@
 #' So keys(groupList[["rna"]]) would have pathway names, generating one PSN
 #' per pathways, and values(groupList[["rna"]]) would be genes that would be
 #' grouped for the corresponding pathwayList.
+#' @param sims (list) rules to create similarity networks from input data. Keys are names of
+#' data layers and should be identical to names(groupList). Values is either a character
+#' for built-in similarity functions; call allowedSims() to see full list; or a custom function.
 #' @param makeNetFunc (function) user-defined function for creating the set
 #' of input PSN provided to netDx. See createPSN_MultiData()::customFunc.
 #' @param outDir (char) directory where results will be stored. If this
@@ -169,9 +172,10 @@
 #' # takes 10 minutes to run
 #' #out <- buildPredictor(dataList=brca,groupList=groupList,
 #' #   makeNetFunc=makeNets, ### custom network creation function
-#' #   outDir=paste(tempdir(),"pred_output",sep=getFileSep()), ## absolute path
+#' #   outDir=paste(normalizePath(tempdir()),"pred_output",sep=getFileSep()), ## absolute path
 #' #   numCores=16L,featScoreMax=2L, featSelCutoff=1L,numSplits=2L)
-buildPredictor <- function(dataList,groupList,outDir=tempdir(),makeNetFunc,
+buildPredictor <- function(dataList,groupList,outDir=normalizePath(tempdir()),
+	makeNetFunc=NULL,sims=NULL,
 	featScoreMax=10L,trainProp=0.8,numSplits=10L,numCores,JavaMemory=4L,
 	featSelCutoff=9L,keepAllData=FALSE,startAt=1L, preFilter=FALSE,
 	impute=FALSE,preFilterGroups=NULL, imputeGroups=NULL,logging="default",
@@ -196,7 +200,7 @@ if (logging == "all") {
 	verbose_predict <- FALSE
 }
 
-# Check input
+# Check input - error handling
 if (missing(dataList)) stop("dataList must be supplied.\n")
 if (missing(groupList)) stop("groupList must be supplied.\n")
 if (length(groupList)<1) stop("groupList must be of length 1+\n")
@@ -213,6 +217,9 @@ if (!is(groupList,"list") || not_list || names_nomatch ) {
 	stop(paste(msg,sep=""))
 }
 
+# checks either/or provided, sets missing var to NULL
+x <- checkMakeNetFuncSims(makeNetFunc=makeNetFunc, sims=sims,groupList=groupList)
+
 if (!is(dataList,"MultiAssayExperiment"))
 	stop("dataList must be a MultiAssayExperiment")
 
@@ -220,6 +227,8 @@ if (trainProp <= 0 | trainProp >= 1)
 		stop("trainProp must be greater than 0 and less than 1")
 if (startAt > numSplits) stop("startAt should be between 1 and numSplits")
 
+# end check input error handling
+
 megaDir <- outDir
 if (file.exists(megaDir)) {
  	stop(paste("outDir seems to already exist!",
@@ -247,8 +256,13 @@ for (k in seq_len(length(exprs))) {
 	tmp <- exprs[[k]]
 	df <- sampleMap(dataList)[which(sampleMap(dataList)$assay==names(exprs)[k]),]
 	colnames(tmp) <- df$primary[match(df$colname,colnames(tmp))]
-	tmp <- as.matrix(assays(tmp)[[1]]) # convert to matrix
-	datList2[[names(exprs)[k]]]<- tmp	
+	if ("matrix" %in% class(tmp)) {
+		datList2[[names(exprs)[k]]] <- tmp
+	} else {
+		tmp <- as.matrix(assays(tmp)[[1]]) # convert to matrix
+		datList2[[names(exprs)[k]]]<- tmp	
+	}
+
 }
 if ("clinical" %in% names(groupList)) {
 	tmp <- colData(dataList)
@@ -274,7 +288,6 @@ if (verbose_default){
 	}
 }
 
-
 outList <- list()
 
 # create master list of possible networks
@@ -289,8 +302,14 @@ colnames(tmp) <- c("NetType","NetName")
 outList[["inputNets"]] <- tmp
 
 if (verbose_default) {
-	message("\n\nCustom function to generate input nets:")
-	print(makeNetFunc)
+	if (!is.null(makeNetFunc)){
+		message("\n\nCustom function to generate input nets:")
+		print(makeNetFunc)
+		
+	} else {
+		message("Similarity metrics provided:")
+		print(sims)
+	}
 	message(sprintf("-------------------------------\n"))
 }
 
@@ -386,8 +405,8 @@ for (rngNum in startAt:numSplits) {
 
 	if (verbose_default) message("** Creating features")
 	createPSN_MultiData(dataList=dats_train,groupList=groupList,
-			pheno=pheno_id,
-			netDir=netDir,customFunc=makeNetFunc,numCores=numCores,
+			pheno=pheno_id, 
+			netDir=netDir,makeNetFunc=makeNetFunc,sims=sims, numCores=numCores,
 			verbose=verbose_makeFeatures)
 	if (verbose_default) message("** Compiling features")
 	dbDir <- compileFeatures(netDir,outDir, numCores=numCores, 
@@ -512,7 +531,8 @@ for (rngNum in startAt:numSplits) {
 		pheno_id <- setupFeatureDB(pheno,netDir)
 		createPSN_MultiData(dataList=dats_tmp,groupList=groupList,
 			pheno=pheno_id,
-			netDir=netDir,customFunc=makeNetFunc,numCores=numCores,
+			netDir=netDir,makeNetFunc=makeNetFunc,sims=sims, 
+			numCores=numCores,
 			filterSet=pTally,verbose=verbose_default)
 		dbDir <- compileFeatures(netDir,outDir=pDir,numCores=numCores,
 			verbose=verbose_compileNets,debugMode=debugMode)
diff --git a/R/buildPredictor_sparseGenetic.R b/R/buildPredictor_sparseGenetic.R
index ea7b9839..0a2ff878 100644
--- a/R/buildPredictor_sparseGenetic.R
+++ b/R/buildPredictor_sparseGenetic.R
@@ -117,7 +117,7 @@
 #'    name=genes[,4])
 #' 
 #' # create GRangesList of pathways
-#' pathFile <- fetchPathwayDefinitions("February",2018,verbose=TRUE)
+#' pathFile <- fetchPathwayDefinitions("February",2021,verbose=TRUE)
 #' pathwayList <- readPathways(pathFile)
 #' path_GRList <- mapNamedRangesToSets(gene_GR,pathwayList)
 #' 
@@ -131,7 +131,7 @@
 #' #head(out$cumulativeFeatScores)
 #' 
 buildPredictor_sparseGenetic <- function(phenoDF,cnv_GR,predClass,
-	group_GRList,outDir=tempdir(),
+	group_GRList,outDir=normalizePath(tempdir()),
 	numSplits=3L, featScoreMax=10L,
 	filter_WtSum=100L,
 	enrichLabels=TRUE,enrichPthresh=0.07,numPermsEnrich=2500L,minEnr=-1,
diff --git a/R/compileFeatures.R b/R/compileFeatures.R
index f5bb54b0..fb9ceb90 100644
--- a/R/compileFeatures.R
+++ b/R/compileFeatures.R
@@ -51,21 +51,21 @@
 #'					writeProfiles=TRUE,...)
 #'     unlist(netList)
 #' }
-#' tmpDir <- tempdir(); netDir <- paste(tmpDir,"nets",
+#' tmpDir <- normalizePath(tempdir()); netDir <- paste(tmpDir,"nets",
 #'	sep=getFileSep())
 #' if (file.exists(netDir)) unlink(netDir,recursive=TRUE)
 #' dir.create(netDir,recursive=TRUE)
 #' 
 #' pheno_id <- setupFeatureDB(pheno,netDir)
 #' netList <- createPSN_MultiData(dataList=dataList, groupList=groupList,
-#'     pheno=pheno_id,netDir=netDir,customFunc=makeNets,verbose=TRUE)
+#'     pheno=pheno_id,netDir=netDir,makeNetFunc=makeNets,verbose=TRUE)
 #' 
 #' outDir <- paste(tmpDir,'dbdir',sep=getFileSep()); 
 #'	dir.create(outDir)
 #' dbDir <- compileFeatures(netDir,outDir)
 #' @import doParallel
 #' @export
-compileFeatures <- function(netDir, outDir = tempdir(),
+compileFeatures <- function(netDir, outDir = normalizePath(tempdir()),
     simMetric = "pearson",
     netSfx = "txt$", verbose = TRUE, numCores = 1L,
     P2N_threshType = "off", P2N_maxMissing = 100,
@@ -125,14 +125,17 @@ compileFeatures <- function(netDir, outDir = tempdir(),
 
     curProf <- ""
     `%myinfix%` <- ifelse(debugMode, `%do%`, `%dopar%`)
+
     foreach(curProf = dir(path = profDir, pattern = "profile$")) %myinfix% {
       args2 <- c("-in", paste(profDir, curProf, sep = getFileSep()))
       args2 <- c(args2, "-out",
-    paste(netOutDir, sub(".profile", ".txt", curProf),
+      paste(netOutDir, sub(".profile", ".txt", curProf),
       sep = getFileSep()))
+
       args2 <- c(args2, "-syn",
     paste(netDir, "1.synonyms", sep = getFileSep()),
       "-keepAllTies", "-limitTies")
+
       if (debugMode) {
         message("Making Java call")
         tmp <- paste(c(args, args2), collapse = " ")
@@ -214,18 +217,19 @@ compileFeatures <- function(netDir, outDir = tempdir(),
   unlink(olddir)
 
   # Check: need to replace commas used as decimal separators, into periods
-  tmp <- dir(path = sprintf("%s/INTERACTIONS", netDir), pattern = "txt$")[1]
-  tmp <- sprintf("%s/INTERACTIONS/%s", netDir, tmp)
+  tmp <- dir(path = paste(netDir,"INTERACTIONS", sep=getFileSep()), pattern = "txt$")[1]
+  tmp <- paste(netDir,"INTERACTIONS",tmp,sep=getFileSep()) # plain path; must not be used as a sprintf format
   if (sum(grepl(pattern = ",", readLines(tmp, n = 1)) > 0)) {
     # detect comma
-    replacePattern(path = sprintf("%s/INTERACTIONS", netDir))
+    replacePattern(path = paste(netDir,"INTERACTIONS", sep=getFileSep()))
   }
 
   # Build GeneMANIA cache
   if (verbose)
     message("\t* Build GeneMANIA cache")
 
-  args <- c("-Xmx10G", "-cp", GM_jar,
+  args <- c("--illegal-access=permit", # needed for Java 9-16. Deprecated in Java 17
+      "-Xmx10G", "-cp", GM_jar,
         "org.genemania.engine.apps.CacheBuilder")
   args <- c(args, "-cachedir", "cache", "-indexDir", ".",
         "-networkDir",
diff --git a/R/convertToMAE.R b/R/convertToMAE.R
new file mode 100644
index 00000000..0e81e3bc
--- /dev/null
+++ b/R/convertToMAE.R
@@ -0,0 +1,82 @@
+#' Wrapper that converts an input list into a MultiAssayExperiment object
+#' 
+#' @details This function takes in a list of key-value pairs (keys: data types,
+#' values: matrices/dataframes) and calls the necessary functions from the
+#' MultiAssayExperiment package to incorporate the values from the input list 
+#' into a MultiAssayExperiment object, transforming the values according to the 
+#' keys. If duplicate sample names are found in the assay data, only the first
+#' instance is kept.
+#' @param dataList  (list) input key-value pairs (keys: data types, values: 
+#' data in the form of matrices/dataframes); must have a key-value pair that
+#' corresponds to patient IDs/metadata labelled pheno.
+#' @return MAE (MultiAssayExperiment) data from input list incorporated into a
+#' MultiAssayExperiment object, compatible with further analysis using the 
+#' netDx algorithm.
+#' @examples
+#' data(xpr, pheno)
+#' 
+#' # Generate random proteomic data
+#' prot <- matrix(rnorm(100*20), ncol=20)
+#' colnames(prot) <- sample(pheno$ID, 20)
+#' rownames(prot) <- sprintf("protein%i",1:100)	
+#' 
+#' myList <- list(rna = xpr, proteomic = prot, pheno = pheno)
+#' 
+#' MAE <- convertToMAE(myList)
+#' @export
+convertToMAE <- function(dataList) {
+  # Check input data. inherits() replaces class(x) != "list": class() can
+  # return several values, and if() errors on a length > 1 condition (R >= 4.2)
+  if (!inherits(dataList, "list")) {
+    stop("dataList must be a list. \n")
+  }
+  # [["pheno"]] matches the key exactly; `$` would partial-match longer names
+  if (is.null(dataList[["pheno"]])) {
+    stop("dataList must have key-value pair labelled pheno.\n")
+  }
+  if (length(dataList) == 1) {
+    stop(paste("dataList must have assay data to incorporate into a",
+         "MultiAssayExperiment object"))
+  }
+  
+  # Note that a MultiAssayExperiment object requires an ExperimentList and 
+  # colData (sampleMap optional if each assay uses the same colnames)
+  
+  # Possible elements for ExperimentList:
+  # - base::matrix (gene expression, microRNA, metabolomics, microbiome data)
+  # - SummarizedExperiment::SummarizedExperiment (same as matrix, but capable
+  #   of storing additional assay-level metadata)
+  # - Biobase::ExpressionSet (legacy representation, use SummarizedExperiment)
+  # - SummarizedExperiment::RangedSummarizedExperiment (range-based datasets; 
+  #   gene expression, methylation, data types that refer to genomic positions)
+  # - RaggedExperiment::RaggedExperiment (range-based datasets; copy number and
+  #   mutation data, measurements by genomic positions)
+  
+  # Assumes that pheno is a DataFrame (or coerceable to be a DataFrame)
+  patientPheno <- dataList[["pheno"]]
+  
+  # Generate ExperimentList from input dataList
+  tmp <- NULL
+  track <- c()
+  datType <- names(dataList)
+  for (k in seq_along(dataList)) {
+    # For key-value pairs that aren't labelled pheno, transform into 
+    # objects compatible with input into MultiAssayExperiment object
+    if (names(dataList[k]) != "pheno") {
+      # Remove duplicated columns (we keep the first column) in the assay data
+      # drop=FALSE keeps a one-column assay two-dimensional after subsetting
+      isDup <- duplicated(colnames(dataList[[k]]))
+      if (any(isDup)) dataList[[k]] <- dataList[[k]][, !isDup, drop=FALSE]
+      
+      # Assumes that data is of matrix class 
+      # *(maybe implement matrix conversion into SummarizedExperiment in future)
+      track <- c(track, k)
+      tmp <- c(tmp, list(dataList[[k]]))
+    }
+  }
+  names(tmp) <- datType[track]
+  
+  MAE <- MultiAssayExperiment(experiments = tmp, colData = patientPheno)
+  
+  return(MAE)
+}
\ No newline at end of file
diff --git a/R/createPSN_MultiData.R b/R/createPSN_MultiData.R
index ea6e976c..719497cd 100644
--- a/R/createPSN_MultiData.R
+++ b/R/createPSN_MultiData.R
@@ -12,11 +12,15 @@
 #' with internally-generated identifiers.
 #' @param netDir (char) path to directory where networks will be stored
 #' @param filterSet (char) vector of networks to include
-#' @param customFunc (function) custom user-function to create PSN. 
+#' @param makeNetFunc (function) custom user-function to create PSN. 
 #' Must take dataList,groupList,netDir as parameters. Must
 #' check if a given groupList is empty (no networks to create) before 
 #' the makePSN call for it. This is to avoid trying to make nets for datatypes
 #' that did not pass feature selection
+#' @param sims (list) Similarity metric settings for patient data. 
+#' Keys must be identical to those of groupList. 
+#' Values are either of type character, used for built-in similarity functions, 
+#' or are functions, when a custom function is provided.
 #' @param verbose (logical) print messages
 #' @param ... other parameters to makePSN_NamedMatrix() or makePSN_RangedSets()
 #' @return (char) vector of network names. Side effect of creating the nets
@@ -95,24 +99,24 @@
 #' pheno_id <- setupFeatureDB(colData(brca),netDir)
 #' createPSN_MultiData(dataList=datList2,groupList=groupList,
 #'  pheno=pheno_id,
-#'  netDir=netDir,customFunc=makeNets,numCores=1)
+#'  netDir=netDir,makeNetFunc=makeNets,numCores=1)
 #' @export
 createPSN_MultiData <- function(dataList, groupList, pheno, netDir=tempdir(), 
 		filterSet = NULL, 
-    verbose = TRUE, customFunc, ...) {
+    verbose = TRUE, makeNetFunc=NULL, sims=NULL, ...) {
     
     if (missing(dataList)) 
         stop("dataList must be supplied.\n")
     if (missing(groupList)) 
         stop("groupList must be supplied.\n")
-    
+ 
     # resolve user-provided IDs with internal IDs
     dataList <- lapply(dataList, function(x) {
         midx <- match(colnames(x), pheno$ID)
         colnames(x) <- pheno$INTERNAL_ID[midx]
         x
     })
-    
+
     if (!is.null(filterSet)) {
         if (length(filterSet) < 1) {
           s1 <- "filterSet is empty."
@@ -120,8 +124,8 @@ createPSN_MultiData <- function(dataList, groupList, pheno, netDir=tempdir(),
         	stop(paste(s1, s2, sep = " "))
 				}
     }
-    if (missing(customFunc)) 
-        stop("customFunc must be suppled.\n")
+
+    
     
     # Filter for nets (potentially feature-selected ones)
     if (!is.null(filterSet)) {
@@ -139,12 +143,22 @@ createPSN_MultiData <- function(dataList, groupList, pheno, netDir=tempdir(),
             }
         }
         groupList <- groupList2
+        sims <- sims[which(names(sims) %in% names(groupList))]
         rm(groupList2)
     }
     
+    if (!is.null(makeNetFunc)){
     # call user-defined function for making PSN
-    netList <- customFunc(dataList = dataList, groupList = groupList, 
+        netList <- makeNetFunc(dataList = dataList, groupList = groupList, 
 				netDir = netDir, ...)
+    } else {
+        netList <- createNetFuncFromSimList(dataList=dataList,
+            groupList = groupList, 
+            netDir = netDir,
+            sims = sims, 
+            ...
+            )
+    }
     
     if (length(netList) < 1) 
         stop("\n\nNo features created! Filters may be too stringent.\n")
diff --git a/R/fileCache.R b/R/fileCache.R
index d56c15f9..187afb11 100644
--- a/R/fileCache.R
+++ b/R/fileCache.R
@@ -19,16 +19,25 @@
 #' @export
 getGMjar_path <- function(verbose = FALSE) {
 
-	java_ver <- suppressWarnings(
+	java_ver <- suppressMessages(suppressWarnings(
 		system2("java", args="--version",stdout=TRUE,stderr=NULL)
-	)
-	if (any(grep(" 11",java_ver)) || any(grep(" 12",java_ver)) || any(grep(" 13",java_ver)) || any(grep(" 14",java_ver)) || any(grep(" 16",java_ver))) {
+	))
+  # Java 11 through 20 (now including 15) all select the java11 jar below
+  if (any(grep(" 11", java_ver)) || any(grep(" 12", java_ver)) ||
+    any(grep(" 13", java_ver)) ||
+    any(grep(" 14", java_ver)) ||
+    any(grep(" 15", java_ver)) ||
+    any(grep(" 16", java_ver)) ||
+    any(grep(" 17", java_ver)) ||
+    any(grep(" 18", java_ver)) ||
+    any(grep(" 19", java_ver)) ||
+    any(grep(" 20", java_ver))) {
 		if (verbose) message("Java 11+ detected")
-    	fileURL <- paste("https://download.baderlab.org/netDx/java11/", 
+    	fileURL <- paste("https://downloads.res.oicr.on.ca/pailab/netDx/java11/", 
 			"genemania-netdx.jar",sep="")
 	} else {
 		if (verbose) message("Java 8 detected")
-    	fileURL <- paste("https://download.baderlab.org/netDx/java8/", 
+    	fileURL <- paste("https://downloads.res.oicr.on.ca/pailab/netDx/java8/", 
 			"genemania-netdx.jar",sep="")
 	}
 	
@@ -45,30 +54,34 @@ getGMjar_path <- function(verbose = FALSE) {
 #' For details see Merico D, Isserlin R, Stueker O, Emili A and GD Bader.
 #' (2010). PLoS One. 5(11):e13984.
 #' @param verbose (logical) print messages
-#' @examples fetchPathwayDefinitions("October",2020)
+#' @examples fetchPathwayDefinitions("October",2021)
 #' @param day (integer)
 #' @param month (numeric or char) month of pathway definition file. Can be
 #' numeric or text (e.g. "January","April"). If NULL, fails.
 #' @param year (numeric) year of pathway definition file. Must be in
-#' yyyy format (e.g. 2018). If NULL, fails.
+#' yyyy format (e.g. 2020). If NULL, fails.
 #' @return (char) Path to local cached copy of GMT file
 #' or initial download is required 
 #' @importFrom httr HEAD
 #' @export
 #' @examples 
-#' fetchPathwayDefinitions("January",2018)
-#' fetchPathwayDefinitions(month=10,year=2020)
+#' fetchPathwayDefinitions("October",2021)
+#' fetchPathwayDefinitions(month=10,year=2021)
 fetchPathwayDefinitions <- function(month=NULL,year=NULL,day=1,verbose=FALSE){
 	if (is.null(month) || is.null(year)) {
 		stop("Please provide a month and year.")
 		#month <- month.name[as.integer(format(Sys.Date(),"%m"))]
 		#year <- as.integer(format(Sys.Date(),"%Y"))
 	}
+	if (year < 2020) {
+		stop("Currently, year must equal 2020 or greater.")
+	}
+
 	if (class(month) %in% c("numeric","integer")) {
 		month <- month.name[month]
 	}
 		pdate <- sprintf("%s_%02d_%i",month,day,year)
-    	pathwayURL <- paste("https://download.baderlab.org/EM_Genesets/", 
+    	pathwayURL <- paste("https://downloads.res.oicr.on.ca/pailab/public/EM_Genesets/", 
 		sprintf("%s/Human/symbol/",pdate),
         sprintf("Human_AllPathways_%s_symbol.gmt",pdate),
 		 sep = "")
@@ -79,7 +92,7 @@ fetchPathwayDefinitions <- function(month=NULL,year=NULL,day=1,verbose=FALSE){
 	if (chk$status_code==404) {
 		stop(paste(sprintf("The pathway file for %02d %s %i doesn't exist.",day,month,year),
 				"Select a different date. ",
-				"See https://download.baderlab.org/EM_Genesets/Human/symbol for options.",
+				"See https://downloads.res.oicr.on.ca/pailab/public/EM_Genesets/Human/symbol for options.",
 				sep=" "))
 	}
     bfcrpath(bfc, pathwayURL)
diff --git a/R/getSimilarity.R b/R/getSimilarity.R
index b104b34c..b2c8f941 100644
--- a/R/getSimilarity.R
+++ b/R/getSimilarity.R
@@ -18,6 +18,9 @@
 #' @importFrom stats cor
 #' @export
 getSimilarity <- function(x, type = "pearson", customFunc, ...) {
-    switch(type, pearson = round(cor(na.omit(x), method = "pearson"), 
-			digits = 3), custom = customFunc(x, ...))
+  switch(type,
+    pearson = round(cor(na.omit(x), method = "pearson"), 
+      digits = 3),
+    custom = customFunc(x, ...)
+    )
 }
diff --git a/R/helper.R b/R/helper.R
index e5e0bdec..0faa4697 100755
--- a/R/helper.R
+++ b/R/helper.R
@@ -12,6 +12,7 @@
 #' @param featureSelPct (numeric between 0 and 1) cutoff percent for feature selection.
 #' A feature must have minimum score of featureSelCutoff for featureSelPct of 
 #' train/test splits to pass.
+#' @param drawPerformancePlot (logical) if TRUE, draws AUROC and AUPR plots. Set to FALSE to suppress graphical output.
 #' @returns list of results.
 #' - selectedFeatures (list of character vectors): list, one per class
 #' - performance (list of mixed datatypes) including mean accuracy (meanAccuracy), 
@@ -24,65 +25,67 @@
 #' getResults(toymodel,patlabels,2,0.5)
 #' 
 #' @export
-getResults <- function(res, status, featureSelCutoff=1L, 
-    featureSelPct=0){
-
-numSplits <- length(grep("^Split",names(res)))
-st <- status
-message(sprintf("Detected %i splits and %i classes", numSplits, length(st)))
-
-acc <- c()         # accuracy
-predList <- list() # prediction tables
-featScores <- list() # feature scores per class
-for (cur in unique(st)) featScores[[cur]] <- list()
-
-# collect accuracy and feature scores
-for (k in 1:numSplits) {
-	pred <- res[[sprintf("Split%i",k)]][["predictions"]];
-	# predictions table
-	tmp <- pred[,c("ID","STATUS","TT_STATUS","PRED_CLASS",
-	                 sprintf("%s_SCORE",st))]
-	predList[[k]] <- tmp
-	# accuracy
-	acc <- c(acc, sum(tmp$PRED==tmp$STATUS)/nrow(tmp))
-	# feature scores
-	for (cur in unique(st)) {
-	   tmp <- res[[sprintf("Split%i",k)]][["featureScores"]][[cur]]
-	   colnames(tmp) <- c("PATHWAY_NAME","SCORE")
-	   featScores[[cur]][[sprintf("Split%i",k)]] <- tmp
-	}
-}
+getResults <- function(res, status, featureSelCutoff = 1L,
+    featureSelPct = 0, drawPerformancePlot = TRUE) {
 
-# only plot ROC and PR curves 
-auroc <- NULL; aupr <- NULL
-if (length(st)==2) {
-message("* Plotting performance")
-predPerf <- plotPerf(predList, predClasses=st)
-auroc <- unlist(lapply(predPerf, function(x) x$auroc))
-aupr <- unlist(lapply(predPerf, function(x) x$aupr))
-}
+  numSplits <- length(grep("^Split", names(res)))
+  st <- status
+  message(sprintf("Detected %i splits and %i classes", numSplits, length(st)))
 
-message("* Compiling feature scores and calling selected features")
-feats <- callOverallSelectedFeatures(featScores, 
+  acc <- c() # accuracy
+  predList <- list() # prediction tables
+  featScores <- list() # feature scores per class
+  for (cur in unique(st))
+    featScores[[cur]] <- list()
+
+  # collect accuracy and feature scores
+  for (k in 1:numSplits) {
+    pred <- res[[sprintf("Split%i", k)]][["predictions"]];
+    # predictions table
+    tmp <- pred[, c("ID", "STATUS", "TT_STATUS", "PRED_CLASS",
+                   sprintf("%s_SCORE", st))]
+    predList[[k]] <- tmp
+    # accuracy
+    acc <- c(acc, sum(tmp$PRED == tmp$STATUS) / nrow(tmp))
+    # feature scores
+    for (cur in unique(st)) {
+      tmp <- res[[sprintf("Split%i", k)]][["featureScores"]][[cur]]
+      colnames(tmp) <- c("PATHWAY_NAME", "SCORE")
+      featScores[[cur]][[sprintf("Split%i", k)]] <- tmp
+    }
+  }
+
+  # only plot ROC and PR curves 
+  auroc <- NULL;
+  aupr <- NULL
+  if (length(st) == 2) {
+    if (drawPerformancePlot) message("* Plotting performance")
+    predPerf <- plotPerf(predList, predClasses = st, drawPlot = drawPerformancePlot)
+    auroc <- unlist(lapply(predPerf, function(x) x$auroc))
+    aupr <- unlist(lapply(predPerf, function(x) x$aupr))
+  }
+
+  message("* Compiling feature scores and calling selected features")
+  feats <- callOverallSelectedFeatures(featScores,
     featureSelCutoff = featureSelCutoff,
     featureSelPct = featureSelPct,
     cleanNames = TRUE
-)
-
-#### Enrichment map
-###if (!is.null(pathwayList)){
-###    message("* Pathway List detected - creating input for EnrichmentMap")
-###    browser()
-###}
-
-return(list(
-    selectedFeatures=feats$selectedFeatures,
-    featureScores=feats$featScores,
-    performance=list(meanAccuracy=mean(acc),
-                    splitAccuracy=acc,
-                    splitAUROC=auroc,
-                    splitAUPR=aupr)
-))
+  )
+
+  #### Enrichment map
+  ###if (!is.null(pathwayList)){
+  ###    message("* Pathway List detected - creating input for EnrichmentMap")
+  ###    browser()
+  ###}
+
+  return(list(
+    selectedFeatures = feats$selectedFeatures,
+    featureScores = feats$featScores,
+    performance = list(meanAccuracy = mean(acc),
+                    splitAccuracy = acc,
+                    splitAUROC = auroc,
+                    splitAUPR = aupr)
+  ))
 
 }
 
@@ -125,30 +128,32 @@ return(list(
 #' featScores: (matrix) feature scores for each split
 #' selectedFeatures: (list) features passing selection for each class; one key per class
 #' @export
-callOverallSelectedFeatures <- function(featScores, featureSelCutoff, 
-    featureSelPct, cleanNames=TRUE){
-featScores2 <- lapply(featScores, getNetConsensus)
-if (cleanNames) {
-    featScores2 <- lapply(featScores2,function(x){
-        x$PATHWAY_NAME <- sub(".profile","",x$PATHWAY_NAME)
-        x$PATHWAY_NAME <- sub("_cont.txt","",x$PATHWAY_NAME)
-        colnames(x)[1] <- "Feature"
-        x
+callOverallSelectedFeatures <- function(featScores, featureSelCutoff,
+    featureSelPct, cleanNames = TRUE) {
+  featScores2 <- lapply(featScores, getNetConsensus)
+  if (cleanNames) {
+    featScores2 <- lapply(featScores2, function(x) {
+      x$PATHWAY_NAME <- sub(".profile", "", x$PATHWAY_NAME)
+      x$PATHWAY_NAME <- sub("_cont.txt", "", x$PATHWAY_NAME)
+      colnames(x)[1] <- "Feature"
+      x
     })
-}
-featSelNet <- lapply(featScores2, function(x) {
-    x <- callFeatSel(x, fsCutoff=featureSelCutoff, fsPctPass=featureSelPct)
-})
-
-return(list(
-    featScores=featScores2,
-    selectedFeatures=featSelNet
-))
+  }
+  featSelNet <- lapply(featScores2, function(x) {
+    x <- callFeatSel(x, fsCutoff = featureSelCutoff, fsPctPass = featureSelPct)
+  })
+
+  return(list(
+    featScores = featScores2,
+    selectedFeatures = featSelNet
+  ))
 }
 
 #' Wrapper to create input files for Enrichment Map
 #'
-#' @details An Enrichment Map is a network-based visualization of top-scoring pathway features
+#' @details Creates the input to visualize selected features and their relationships
+#' as a network in Cytoscape. The type of visualization is called an Enrichment Map.
+#' An Enrichment Map is a network-based visualization of top-scoring pathway features
 #' and themes. It is generated in Cytoscape. This script generates the input files needed
 #' for Cytoscape to create an Enrichment Map visualization.
 #' @param model (list) Output of training model, generated by running buildPredictor()
@@ -159,49 +164,51 @@ return(list(
 #' @param EMapPctPass (numeric between 0 and 1) percent of splits for which feature must have score in range
 #'  [EMapMinScore,EMapMaxScore] to be included for EnrichmentMap visualization
 #' @param outDir (char) directory where files should be written
-#' @return 
-#' @export
-makeInputForEnrichmentMap <- function(model,results,pathwayList,
-    EMapMinScore=0L, EMapMaxScore=1L,
-    EMapPctPass=0.5,outDir)
-{
-    featScores <- results$featureScores
-
-message("* Creating input files for EnrichmentMap")
-Emap_res <- getEMapInput_many(featScores,
+#' @return (list) 1) GMTfiles (char): GMT files used to create EnrichmentMap in Cytoscape.
+#' 2) NodeStyles (char): .txt files used to assign node attributes in Cytoscape. Importantly, 
+#' attributes include node fill, which indicates the highest consistent score for a given 
+#' feature. 
+#' @export 
+createInputForFeatureNetworkView <- function(model, results, pathwayList,
+    EMapMinScore = 0L, EMapMaxScore = 1L,
+    EMapPctPass = 0.5, outDir) {
+  featScores <- results$featureScores
+
+  message("* Creating input files for feature network view")
+  Emap_res <- getEMapInput_many(featScores,
     pathwayList,
-    minScore=EMapMinScore,
-    maxScore=EMapMaxScore,
-    pctPass=EMapPctPass,
+    minScore = EMapMinScore,
+    maxScore = EMapMaxScore,
+    pctPass = EMapPctPass,
     model$inputNets,
-    verbose=FALSE
-)
+    verbose = FALSE
+  )
 
-gmtFiles <- list()
-nodeAttrFiles <- list()
+  gmtFiles <- list()
+  nodeAttrFiles <- list()
 
-message("* Writing files for network visualization")
-for (g in names(Emap_res)) {
-    outFile <- paste(outDir,sprintf("%s_nodeAttrs.txt",g),sep=getFileSep())
-    write.table(Emap_res[[g]][["nodeAttrs"]],file=outFile,
-        sep="\t",col.names=TRUE,row.names=FALSE,quote=FALSE)
+  message("* Writing files for network visualization")
+  for (g in names(Emap_res)) {
+    outFile <- paste(outDir, sprintf("%s_nodeAttrs.txt", g), sep = getFileSep())
+    write.table(Emap_res[[g]][["nodeAttrs"]], file = outFile,
+        sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
     nodeAttrFiles[[g]] <- outFile
 
-    outFile <- paste(outDir,sprintf("%s.gmt",g),sep=getFileSep())
+    outFile <- paste(outDir, sprintf("%s.gmt", g), sep = getFileSep())
     conn <- suppressWarnings(
-         suppressMessages(base::file(outFile,"w")))
+         suppressMessages(base::file(outFile, "w")))
     tmp <- Emap_res[[g]][["featureSets"]]
     gmtFiles[[g]] <- outFile
 
     for (cur in names(tmp)) {
-        curr <- sprintf("%s\t%s\t%s", cur,cur,
-            paste(tmp[[cur]],collapse="\t"))
-        writeLines(curr,con=conn)
+      curr <- sprintf("%s\t%s\t%s", cur, cur,
+            paste(tmp[[cur]], collapse = "\t"))
+      writeLines(curr, con = conn)
     }
-close(conn)
-}
+    close(conn)
+  }
 
-return(list(GMTfiles=gmtFiles,NodeStyles=nodeAttrFiles))
+  return(list(GMTfiles = gmtFiles, NodeStyles = nodeAttrFiles))
 }
 
 #' get the integrated patient similarity network made of selected features
@@ -214,8 +221,10 @@ return(list(GMTfiles=gmtFiles,NodeStyles=nodeAttrFiles))
 #' same class, relative to those of other classes, using Dijkstra distance (calcShortestPath flag).  
 #' @param dat (MultiAssayExperiment) input data
 #' @param groupList (list) feature groups, identical to groupList provided for buildPredictor()
-#' @param makeNets (function) Function used to create patient similarity networks. Identical to 
+#' @param makeNetFunc (function) Function used to create patient similarity networks. Identical to 
 #' makeNets provided to buildPredictor()
+#' @param sims (list) rules for creating PSN. Preferred over makeNetFunc. See buildPredictor() 
+#' for details.
 #' @param selectedFeatures (list) selected features for each class (key of list). This object is returned as
 #' part of a call to getResults(), after running buildPredictor().
 #' @param plotCytoscape (logical) If TRUE, plots network in Cytoscape.
@@ -245,35 +254,50 @@ return(list(GMTfiles=gmtFiles,NodeStyles=nodeAttrFiles))
 #' colours (colour)
 #' 6) outDir (char) value of outDir parameter
 #' @export
-getPSN <- function(dat, groupList, makeNets, selectedFeatures, plotCytoscape=FALSE,
-    aggFun="MEAN", prune_pctX=0.30, prune_useTop=TRUE,numCores=1L,calcShortestPath=FALSE
-    ){
-topPath <- gsub(".profile","", unique(unlist(selectedFeatures)))
-topPath <- gsub("_cont.txt","",topPath)
-
-## create groupList limited to top features
-g2 <- list();
-for (nm in names(groupList)) {
-	cur <- groupList[[nm]]
-	idx <- which(names(cur) %in% topPath)
-	message(sprintf("%s: %i features", nm, length(idx)))
-	if (length(idx)>0) g2[[nm]] <- cur[idx]
-}
+getPSN <- function(dat, groupList,
+    makeNetFunc = NULL, sims = NULL,
+    selectedFeatures, plotCytoscape = FALSE,
+    aggFun = "MEAN", prune_pctX = 0.30, prune_useTop = TRUE,
+    numCores = 1L, calcShortestPath = FALSE
+    ) {
 
-message("* Making integrated PSN")
-psn <- 
-   plotIntegratedPatientNetwork(dat,
-  groupList=g2, makeNetFunc=makeNets,
-  aggFun=aggFun,
-  prune_pctX=prune_pctX,
-  prune_useTop=prune_useTop,
-  numCores=numCores,
-  calcShortestPath=calcShortestPath,
-  showStats=FALSE,
-  verbose=TRUE, 
-  plotCytoscape=plotCytoscape)
-
-return(psn)
+
+  # checks either/or provided, sets missing var to NULL
+  x <- checkMakeNetFuncSims(makeNetFunc = makeNetFunc,
+    sims = sims, groupList = groupList)
+
+  topPath <- gsub(".profile", "", unique(unlist(selectedFeatures)))
+  topPath <- gsub("_cont.txt", "", topPath)
+
+  ## create groupList limited to top features
+  g2 <- list()
+  s2 <- list()
+  for (nm in names(groupList)) {
+    cur <- groupList[[nm]]
+    idx <- which(names(cur) %in% topPath)
+    message(sprintf("%s: %i features", nm, length(idx)))
+    if (length(idx) > 0) {
+      g2[[nm]] <- cur[idx]
+      s2[[nm]] <- sims[[nm]]
+    }
+  }
+
+  message("* Making integrated PSN")
+  psn <-
+   plotIntegratedPatientNetwork(
+       dataList = dat,
+  groupList = g2, makeNetFunc = makeNetFunc,
+  sims = s2,
+  aggFun = aggFun,
+  prune_pctX = prune_pctX,
+  prune_useTop = prune_useTop,
+  numCores = numCores,
+  calcShortestPath = calcShortestPath,
+  showStats = FALSE,
+  verbose = TRUE,
+  plotCytoscape = plotCytoscape)
+
+  return(psn)
 }
 
 #' Make confusion matrix
@@ -291,37 +315,38 @@ return(psn)
 #' @importFrom plotrix color2D.matplot
 #' @export
 confusionMatrix <- function(model) {
-    nmList <- names(model)[grep("Split",names(model))]
-    cl <- sort(unique(model$Split1$STATUS))
-    conf <- list()
-    mega <- NULL
-    for (nm in nmList){
-        pred <- model[[nm]][["predictions"]][,c("ID","STATUS","TT_STATUS","PRED_CLASS")]
-        m <- as.matrix(table(pred[,c("STATUS","PRED_CLASS")]))
-        conf[[nm]] <- m/colSums(m)
-        if (is.null(mega)) mega <- conf[[nm]] else mega <- mega + conf[[nm]]
-    }
-    
-        mega <- mega / length(conf) # average
-        mega <- round(mega*100,2)
-        mega <- t(mega)
-        metric <- "%% Accuracy"
-        
-        tbl <- table(model$Split1$predictions$STATUS)
-        nm <- names(tbl); val <- as.integer(tbl)
-        ttl <- sprintf("%s\n(N=%i)",rownames(mega),val[match(rownames(mega),nm)])
-
-    par(mar=c(4,8,2,2))
-    color2D.matplot(mega,show.values=TRUE, border="white", 
-        #cs1=c(1,1,1),cs2=c(1,0.5,0),cs3=c(0,0.5,0), 
-        extremes=c(1,2),
-        axes=FALSE,        
-        xlab="Predicted class",ylab="")
-    axis(1,at=seq_len(ncol(mega))-0.5,labels=colnames(mega))
-    axis(2,at=seq_len(ncol(mega))-0.5,labels=rev(ttl),las=2)
-    title(sprintf("Confusion matrix: Accuracy (avg of %i splits)",length(conf)))
-
-    return(list(splitWiseConfMatrix=conf, average=mega))
+  nmList <- names(model)[grep("Split", names(model))]
+  cl <- sort(unique(model$Split1$STATUS))
+  conf <- list()
+  mega <- NULL
+  for (nm in nmList) {
+    pred <- model[[nm]][["predictions"]][, c("ID", "STATUS", "TT_STATUS", "PRED_CLASS")]
+    m <- as.matrix(table(pred[, c("STATUS", "PRED_CLASS")]))
+    conf[[nm]] <- m / colSums(m)
+    if (is.null(mega)) mega <- conf[[nm]] else mega <- mega + conf[[nm]]
+  }
+
+  mega <- mega / length(conf) # average
+  mega <- round(mega * 100, 2)
+  mega <- t(mega)
+  metric <- "%% Accuracy"
+
+  tbl <- table(model$Split1$predictions$STATUS)
+  nm <- names(tbl)
+  val <- as.integer(tbl)
+  ttl <- sprintf("%s\n(N=%i)", rownames(mega), val[match(rownames(mega), nm)])
+
+  par(mar = c(4, 8, 2, 2))
+  color2D.matplot(mega, show.values = TRUE, border = "white",
+  #cs1=c(1,1,1),cs2=c(1,0.5,0),cs3=c(0,0.5,0), 
+        extremes = c(1, 2),
+        axes = FALSE,
+        xlab = "Predicted class", ylab = "")
+  axis(1, at = seq_len(ncol(mega)) - 0.5, labels = colnames(mega))
+  axis(2, at = seq_len(ncol(mega)) - 0.5, labels = rev(ttl), las = 2)
+  title(sprintf("Confusion matrix: Accuracy (avg of %i splits)", length(conf)))
+
+  return(list(splitWiseConfMatrix = conf, average = mega))
 }
 
 #' Plot tSNE
@@ -331,7 +356,7 @@ confusionMatrix <- function(model) {
 #' matrix (symmetric). Row and column names are patient IDs. Note that NA
 #' values will be replaced by very small number (effectively zero).
 #' @param pheno (data.frame) Patient labels. ID column is patient ID and 
-#' STATUS is patient label of interest. tSNE will colour-code nodes by 
+#' STATUS is patient label of interest. tSNE will colour-code nodes by 
 #' patient label.
 #' @param ... Parameters for Rtsne() function.
 #' @return (Rtsne) output of Rtsne call. Side effect of tSNE plot
@@ -348,31 +373,31 @@ confusionMatrix <- function(model) {
 #' pheno <- data.frame(ID=pid,STATUS=c(rep("control",50),rep("case",50)))
 #' tSNEPlotter(psn2,pheno)
 #' @export
-tSNEPlotter <- function(psn,pheno,...) {
-
-message("* Making symmetric matrix")
-symmForm <- suppressMessages(makeSymmetric(psn))
-symmForm[which(is.na(symmForm))] <- .Machine$double.eps
-message("* Running tSNE")
-x <- Rtsne(symmForm,...)
-dat <- x$Y
-samps <- rownames(symmForm)
-idx <- match(samps, pheno$ID)
-if (all.equal(pheno$ID[idx],samps)!=TRUE) {
-	stop("pheno IDs not matching psn rownames")
-}
-st <- pheno$STATUS[idx]
+tSNEPlotter <- function(psn, pheno, ...) {
+
+  message("* Making symmetric matrix")
+  symmForm <- suppressMessages(makeSymmetric(psn))
+  symmForm[which(is.na(symmForm))] <- .Machine$double.eps
+  message("* Running tSNE")
+  x <- Rtsne(symmForm, ...)
+  dat <- x$Y
+  samps <- rownames(symmForm)
+  idx <- match(samps, pheno$ID)
+  if (all.equal(pheno$ID[idx], samps) != TRUE) {
+    stop("pheno IDs not matching psn rownames")
+  }
+  st <- pheno$STATUS[idx]
 
-# to eliminate the "no visible binding for global variable" problem
-y <- status <- NULL
+  # to eliminate the "no visible binding for global variable" problem
+  y <- status <- NULL
 
-message("* Plotting")
-colnames(dat) <- c("x","y")
-dat <- as.data.frame(dat,stringsAsFactors=TRUE)
-dat$status <- as.factor(st)
-p <- ggplot2::ggplot(dat,aes(x,y)) + geom_point(aes(colour=status))
-p <- p + xlab("") + ylab("") + ggtitle("Integrated PSN - tSNE")
-print(p)
+  message("* Plotting")
+  colnames(dat) <- c("x", "y")
+  dat <- as.data.frame(dat, stringsAsFactors = TRUE)
+  dat$status <- as.factor(st)
+  p <- ggplot2::ggplot(dat, aes(x, y)) + geom_point(aes(colour = status))
+  p <- p + xlab("") + ylab("") + ggtitle("Integrated PSN - tSNE")
+  print(p)
 
-return(x)
+  return(x)
 }
\ No newline at end of file
diff --git a/R/makePSN_NamedMatrix.R b/R/makePSN_NamedMatrix.R
index 97711c37..18fbe832 100644
--- a/R/makePSN_NamedMatrix.R
+++ b/R/makePSN_NamedMatrix.R
@@ -101,28 +101,31 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir = tempdir(),
         if (verbose) 
             message(sprintf("%i members", length(idx)))
         
-        oFile <- NULL
+      oFile <- NULL
+        
         # has sufficient connections to make network
         if (length(idx) >= minMembers) {
             if (writeProfiles) {
-                outFile <- paste(outDir,sprintf("%s.profile",curSet),
-					sep=getFileSep())
-                write.table(t(xpr[idx, , drop = FALSE]), file = outFile, 
-				sep = "\t",dec=".",
-                  col.names = FALSE, row.names = TRUE, quote = FALSE)
+              outFile <- paste(outDir, sprintf("%s.profile", curSet),
+                sep=getFileSep())
+              write.table(t(xpr[idx,, drop = FALSE]), file = outFile,
+                sep = "\t", dec = ".",
+                col.names = FALSE, row.names = TRUE, quote = FALSE)
+
             } else {
-                outFile <- paste(outDir,sprintf("%s_cont.txt", curSet),
-			sep=getFileSep())
+              outFile <- paste(outDir, sprintf("%s_cont.txt", curSet),
+                sep=getFileSep())
                 message(sprintf("computing sim for %s", curSet))
-                sim <- getSimilarity(xpr[idx, , drop = FALSE], 
-			type = simMetric, 
-                  ...)
+              sim <- getSimilarity(xpr[idx,, drop = FALSE],
+                type = simMetric,
+                ...)
                 if (is.null(sim)) {
-                  stop(sprintf(paste("makePSN_NamedMatrix:%s: ", 
-		"similarity matrix is empty (NULL).\n", 
-                "Check that there isn't a mistake in the ", 
-		"input data or similarity method of choice.\n", 
-                    sep = ""), curSet))
+                  stop(sprintf(paste("makePSN_NamedMatrix:%s: ",
+                  "similarity matrix is empty (NULL).\n",
+                  "Check that there isn't a mistake in the ",
+                  "input data or similarity method of choice.\n",
+                  sep = ""),
+                  curSet))
                 }
                 pat_pairs <- sim
                 
@@ -149,8 +152,8 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir = tempdir(),
                     })
                   }
                 } else {
-                  write.table(pat_pairs, file = outFile, sep = "\t", 
-					col.names = FALSE,
+                  write.table(pat_pairs, file = outFile, sep = "\t",
+                  col.names = FALSE,
                 	row.names = FALSE, quote = FALSE)
                   print(basename(outFile))
                   message("done")
diff --git a/R/plotEmap.R b/R/plotEmap.R
index 2e1ff6bc..450a68f8 100644
--- a/R/plotEmap.R
+++ b/R/plotEmap.R
@@ -55,98 +55,109 @@
 #' nodeAttrFile <- EMap_input[[1]][2]
 #' 
 #' # not run because requires Cytoscape to be installed and open
-#' # plotEmap(gmtFile = gmtFile, nodeAttrFile = nodeAttrFile, 
+#' # viewSelectedFeaturesAsNetworks(gmtFile = gmtFile, nodeAttrFile = nodeAttrFile, 
 #' #\t\tnetName='HighRisk')
-#' @return No value. Side effect of plotting the EnrichmentMap in an open 
+#' @return No value. Side effect of plotting the network view for features in an open 
 #' session of Cytoscape.
 #' @export
-plotEmap <- function(gmtFile, nodeAttrFile, netName = "generic", 
-	  scoreCol="maxScore",
-		minScore = 1, maxScore = 10, nodeFillStops=c(7,9),
-    colorScheme = "cont_heatmap", imageFormat = "png", verbose = FALSE, 
-		createStyle = TRUE, 
-    groupClusters = FALSE, hideNodeLabels=FALSE) {
+viewSelectedFeaturesAsNetworks <- function(gmtFile, nodeAttrFile, netName = "generic",
+    scoreCol = "maxScore",
+    minScore = 1, maxScore = 10, nodeFillStops = c(7, 9),
+    colorScheme = "cont_heatmap", imageFormat = "png", verbose = FALSE,
+    createStyle = TRUE,
+    groupClusters = FALSE, hideNodeLabels = FALSE) {
 
-  if (!requireNamespace("RCy3",quietly=TRUE)) {
-		stop("Package \"RCy3\" needed for plotEmap() to work. Please install it and then make your call.",
-		call.=FALSE)
-	}
-    
-    validColSchemes <- c("cont_heatmap", "netDx_ms")
-    if (!colorScheme %in% validColSchemes) {
-        stop(sprintf("colorScheme should be one of { %s }\n", 
-					paste(validColSchemes, 
+  if (!requireNamespace("RCy3", quietly = TRUE)) {
+    stop("Package \"RCy3\" needed for plotEmap() to work. Please install it and then make your call.",
+    call. = FALSE)
+  }
+
+  tryCatch({
+    RCy3::cytoscapePing()
+  }, error = function(ex) {
+    stop("Error while trying to ping Cytoscape. Are you sure you have Cytoscape installed and currently running?")
+  }, finally = {
+  })
+
+  validColSchemes <- c("cont_heatmap", "netDx_ms")
+  if (!colorScheme %in% validColSchemes) {
+    stop(sprintf("colorScheme should be one of { %s }\n",
+          paste(validColSchemes,
             collapse = ",")))
-    }
-    
-    ####################################### create EM using given parameters
-    if (netName %in% getNetworkList()) {
-        RCy3::deleteNetwork(netName)
-    }
-    em_command <- paste("enrichmentmap build analysisType=\"generic\"", 
-				"gmtFile=", gmtFile, "pvalue=", 1, "qvalue=", 1, 
-				"similaritycutoff=", 0.05, "coefficients=", "JACCARD")
-    response <- RCy3::commandsGET(em_command)
-    renameNetwork(netName, getNetworkSuid())
-    
-    ### #annotate the network using AutoAnnotate app
-    aa_command <- paste("autoannotate annotate-clusterBoosted", 
-				"clusterAlgorithm=MCL", 
+  }
+
+  ####################################### create EM using given parameters
+  if (netName %in% RCy3::getNetworkList()) {
+    RCy3::deleteNetwork(netName)
+  }
+
+  em_command <- paste("enrichmentmap build analysisType=\"generic\"",
+        "gmtFile=", gmtFile, "pvalue=", 1, "qvalue=", 1,
+        "similaritycutoff=", 0.05, "coefficients=", "JACCARD")
+  response <- RCy3::commandsGET(em_command)
+  RCy3::renameNetwork(netName, RCy3::getNetworkSuid())
+
+  ### #annotate the network using AutoAnnotate app
+  aa_command <- paste("autoannotate annotate-clusterBoosted",
+        "clusterAlgorithm=MCL",
         "labelColumn=name", "maxWords=3", "network=", netName)
-    print(aa_command)
-    response <- RCy3::commandsGET(aa_command)
-    
-    message("* Importing node attributes\n")
-    table_command <- sprintf(paste("table import file file=%s ", 
-				"keyColumnIndex=1 ", 
-        "firstRowAsColumnNames=true startLoadRow=1 TargetNetworkList=%s ", 
-				"WhereImportTable=To%%20selected%%20networks%%20only", 
-        sep = " "), nodeAttrFile, netName)
-    response <- RCy3::commandsGET(table_command)
-    
-    # apply style
-    message("* Creating or applying style\n")
-    all_unique_scores_int <- sort(unique(read.delim(nodeAttrFile)[, 2]))
-    all_unique_scores <- unlist(lapply(all_unique_scores_int, toString))
-    styleName <- "EMapStyle"
-    
-    # define colourmap
-    scoreVals <- minScore:maxScore
-    style_cols <- ""
-    if (colorScheme == "cont_heatmap") {
-        colfunc <- colorRampPalette(c("yellow", "red"))
-        gradient_cols <- colfunc(length(scoreVals))
-        style_cols <- colfunc(length(scoreVals))
-    } else if (colorScheme == "netDx_ms") {
-        style_cols <- rep("white", length(scoreVals))
-        style_cols[which(scoreVals >= nodeFillStops[1])] <- "orange"
-        style_cols[which(scoreVals >= nodeFillStops[2])] <- "red"
-    }
-    nodeLabels <- RCy3::mapVisualProperty("node label", "name", "p")
-    nodeFills <- RCy3::mapVisualProperty("node fill color", scoreCol, "d", 
-				scoreVals, style_cols)
-    defaults <- list(NODE_SHAPE = "ellipse", NODE_SIZE = 30, 
-				EDGE_TRANSPARENCY = 200, 
-        NODE_TRANSPARENCY = 255, EDGE_STROKE_UNSELECTED_PAINT = "#999999")
-    if (createStyle) {
-        message("Making style\n")
-        RCy3::createVisualStyle(styleName, defaults, list(nodeLabels, nodeFills))
-    }
-    RCy3::setVisualStyle(styleName)
-    if (groupClusters) {
-        RCy3::layoutNetwork("attributes-layout NodeAttribute=__mclCLuster")
-        redraw_command <- sprintf("autoannotate redraw network=%s", 
-					RCy3::getNetworkSuid())
-        response <- RCy3::commandsGET(redraw_command)
-        RCy3::fitContent()
-        
-        redraw_command <- sprintf("autoannotate redraw network=%s", 
-					RCy3::getNetworkSuid())
-        response <- RCy3::commandsGET(redraw_command)
-        RCy3::fitContent()
-    }
+  #print(aa_command)
+  response <- RCy3::commandsGET(aa_command)
+
+  message("* Importing node attributes\n")
+  attrs <- read.delim(nodeAttrFile, header = T, as.is = T)
+  RCy3::loadTableData(attrs, data.key.column = "netName")
+
+  # apply style
+  message("* Creating or applying style\n")
+  all_unique_scores_int <- sort(unique(read.delim(nodeAttrFile)[, 2]))
+  all_unique_scores <- unlist(lapply(all_unique_scores_int, toString))
+  styleName <- "EMapStyle"
+
+  # define colourmap
+  scoreVals <- minScore:maxScore
+  style_cols <- ""
+  if (colorScheme == "cont_heatmap") {
+    colfunc <- colorRampPalette(c("yellow", "red"))
+    gradient_cols <- colfunc(length(scoreVals))
+    style_cols <- colfunc(length(scoreVals))
+  } else if (colorScheme == "netDx_ms") {
+    style_cols <- rep("white", length(scoreVals))
+    style_cols[which(scoreVals >= nodeFillStops[1])] <- "orange"
+    style_cols[which(scoreVals >= nodeFillStops[2])] <- "red"
+  }
+
+  nodeLabels <- RCy3::mapVisualProperty("node label", "name", "p")
+  nodeFills <- RCy3::mapVisualProperty("node fill color", scoreCol, "d",
+        scoreVals, style_cols)
+  defaults <- list(
+    "node shape" = "ellipse",
+    "node size" = 30,
+    "edge transparency" = 200,
+    "node transparency" = 255,
+    "edge stroke unselected paint" = "#999999"
+    )
+  if (createStyle) {
+    message("\tCreating style\n")
+    RCy3::createVisualStyle(styleName, defaults, list(nodeLabels, nodeFills))
+  }
+  RCy3::setVisualStyle(styleName)
+  if (groupClusters) {
+    RCy3::layoutNetwork("attributes-layout NodeAttribute=__mclCLuster")
+    redraw_command <- sprintf("autoannotate redraw network=%s",
+          RCy3::getNetworkSuid())
+    response <- RCy3::commandsGET(redraw_command)
+    RCy3::fitContent()
+
+    redraw_command <- sprintf("autoannotate redraw network=%s",
+          RCy3::getNetworkSuid())
+    response <- RCy3::commandsGET(redraw_command)
+    RCy3::fitContent()
+    RCy3::fitContent()
+  }
 
-		if (hideNodeLabels) {
-			RCy3::setNodeFontSizeDefault(0,styleName)
-	}
+  if (hideNodeLabels) {
+    RCy3::setNodeFontSizeDefault(0, styleName)
+    RCy3::fitContent()
+  }
 }
diff --git a/R/plotIntegratedPatientNetwork.R b/R/plotIntegratedPatientNetwork.R
index cd9ed4c4..4cb86750 100644
--- a/R/plotIntegratedPatientNetwork.R
+++ b/R/plotIntegratedPatientNetwork.R
@@ -19,6 +19,7 @@
 #' list of lists, where the outer list corresponds to assay (e.g. mRNA,
 #' clinical) and inner list to features to generate from that datatype.
 #' @param makeNetFunc (function) function to create features
+#' @param sims (list) rules for creating PSN. Preferred over makeNetFunc
 #' @param setName (char) name to assign the network in Cytoscape
 #' @param numCores (integer) number of cores for parallel processing
 #' @param prune_pctX (numeric between 0 and 1) fraction of most/least 
@@ -59,7 +60,8 @@
 #' @importFrom RColorBrewer brewer.pal
 #' @importFrom stats wilcox.test qexp density
 #' @export
-plotIntegratedPatientNetwork <- function(dataList,groupList,makeNetFunc,
+plotIntegratedPatientNetwork <- function(dataList,groupList,
+	makeNetFunc=NULL,sims=NULL,
 	setName="predictor",prune_pctX=0.05, prune_useTop=TRUE,
 	 aggFun="MAX",calcShortestPath=FALSE,
 	showStats=FALSE,
@@ -67,6 +69,11 @@ plotIntegratedPatientNetwork <- function(dataList,groupList,makeNetFunc,
 	nodeTransparency=155L,plotCytoscape=FALSE,
 	verbose=FALSE) {
 
+
+# checks either/or provided, sets missing var to NULL
+checkMakeNetFuncSims(makeNetFunc=makeNetFunc, 
+    sims=sims,groupList=groupList)
+
 if (missing(dataList)) stop("dataList is missing.")
 
 dat <- dataList2List(dataList, groupList)
@@ -81,7 +88,9 @@ pheno_id <- setupFeatureDB(pheno,outDir)
 
 createPSN_MultiData(dataList=dat$assays,groupList=groupList,
 			pheno=pheno_id,
-			netDir=outDir,customFunc=makeNetFunc,numCores=numCores,
+			netDir=outDir,
+			makeNetFunc=makeNetFunc,sims=sims,
+			numCores=numCores,
 			verbose=FALSE)
 convertProfileToNetworks(
 		netDir=profDir,
diff --git a/R/plotPerf.R b/R/plotPerf.R
index 632c1d60..3f935022 100644
--- a/R/plotPerf.R
+++ b/R/plotPerf.R
@@ -11,6 +11,7 @@
 #' @param predClasses (char) vector of class names.
 #' @param plotSEM (logical) metric for error bars. If set to TRUE, plots SEM;
 #' else plots SD. 
+#' @param drawPlot (logical) If TRUE, draws AUROC and AUPR curves.
 #' @return (list) each key corresponds to an input file in inDir.
 #' Value is a list with:
 #' 1) stats: 'stats' component of perfCalc
@@ -39,115 +40,120 @@
 #' @importFrom stats sd
 #' @importFrom graphics abline axis par points segments text title hist
 #' @export
-plotPerf <- function(resList=NULL, inFiles, predClasses,plotSEM=FALSE) {
-	if (is.null(resList)) {
-    	if (missing(inFiles)) 
-        	stop("inDir not provided")
-	}
-    if (missing(predClasses)) 
-        stop("predClasses missing; please specify classes")
-    
-    # given output of performance('precall') compute AUC-PR
-    prauc <- function(dat) {
-        x <- dat@x.values[[1]]  # recall
-        y <- dat@y.values[[1]]  # precision
-        
-        # remove NAN
-        idx <- which(is.nan(y))
-        if (any(idx)) {
-            x <- x[-idx]
-            y <- y[-idx]
-        }
-        
-        pracma::trapz(x, y)
+plotPerf <- function(resList = NULL, inFiles, predClasses, plotSEM = FALSE, drawPlot = TRUE) {
+  if (is.null(resList)) {
+    if (missing(inFiles))
+      stop("inDir not provided")
+  }
+  if (missing(predClasses))
+    stop("predClasses missing; please specify classes")
+
+  # given output of performance('precall') compute AUC-PR
+  prauc <- function(dat) {
+    x <- dat@x.values[[1]] # recall
+    y <- dat@y.values[[1]] # precision
+
+    # remove NAN
+    idx <- which(is.nan(y))
+    if (any(idx)) {
+      x <- x[-idx]
+      y <- y[-idx]
     }
-    
-	if (is.null(resList)) {
-		resList <- list(); ctr <- 1
-	    for (fName in inFiles) {
-	        resList[[ctr]] <- read.delim(fName, 
-				sep = "\t", header = TRUE, as.is = TRUE)
-			ctr <- ctr+1
-		}
-	}
-    
-   	mega <- list()
-	for (ctr  in seq_len(length(resList))) {
-		dat <- resList[[ctr]]
-        out <- list()
-        overall_acc <- numeric()
-        curRoc <- list()
-        curPr <- list()
-        
-        pred_col1 <- sprintf("%s_SCORE", predClasses[1])
-        pred_col2 <- sprintf("%s_SCORE", predClasses[2])
-        
-        idx1 <- which(colnames(dat) == pred_col1)
-        idx2 <- which(colnames(dat) == pred_col2)
-        pred <- ROCR::prediction(dat[, idx1] - dat[, idx2], 
-						dat$STATUS == predClasses[1])
-        
-        c1 <- predClasses[1]  #numc[1]
-        tp <- sum(dat$STATUS == dat$PRED_CLASS & dat$STATUS == c1)
-        tn <- sum(dat$STATUS == dat$PRED_CLASS & dat$STATUS != c1)
-        fp <- sum(dat$STATUS != dat$PRED_CLASS & dat$STATUS != c1)
-        fn <- sum(dat$STATUS != dat$PRED_CLASS & dat$STATUS == c1)
-        
-        # entire curves
-        curRoc <- ROCR::performance(pred, "tpr", "fpr")
-        curPr <- ROCR::performance(pred, "prec", "rec")
-        tmp <- data.frame(score = 0, tp = tp, tn = tn, fp = fp, fn = fn)
-        out <- perfCalc(tmp)
-        
-        # statistic
-        auroc <- performance(pred, "auc")@y.values[[1]]
-        aupr <- prauc(curPr)
-        corr <- sum(dat$STATUS == dat$PRED_CLASS)
-        overall_acc <- c(overall_acc, corr/nrow(dat) * 100)
-        
-        ### TODO put in F1.
-        mega[[ctr]] <- list(stats = out$stats, roc_curve = curRoc, 
-						pr_curve = curPr, 
-            auroc = auroc, aupr = aupr, accuracy = overall_acc)
+
+    pracma::trapz(x, y)
+  }
+
+  if (is.null(resList)) {
+    resList <- list();
+    ctr <- 1
+    for (fName in inFiles) {
+      resList[[ctr]] <- read.delim(fName,
+        sep = "\t", header = TRUE, as.is = TRUE)
+      ctr <- ctr + 1
     }
-    
-    .plotAvg <- function(res, name,plotSEM) {
-        mu <- mean(res, na.rm = TRUE)
-		if (plotSEM) {
-        	err <- sd(res, na.rm = TRUE)/sqrt(length(res))
-			errnm <- "SEM"
-		} else {
-        	err <- sd(res, na.rm = TRUE)
-			errnm <- "SD"
-}
-        plot(1, mu, type = "n", bty = "n", 
-						ylab = sprintf("%s (mean+/-%s)", name,errnm), 
-						xaxt = "n", ylim = c(0.4, 1), las = 1, 
-								xlim = c(0.8,1.2), 
-								cex.axis = 1.4, xlab = "")
-        abline(h = c(0.7, 0.8), col = "cadetblue3", lty = 3, lwd = 3)
-        points(1, mu, type = "p", cex = 1.4, pch = 16)
-        
-        # error bars
-        segments(x0 = 1, y0 = mu - err, y1 = mu + err, lwd = 3)
-        segments(x0 = 1 - 0.01, x1 = 1 + 0.01, y0 = mu - err, y1 = mu - err)
-        segments(x0 = 1 - 0.01, x1 = 1 + 0.01, y0 = mu + err, y1 = mu + err)
-        abline(h = 0.5, col = "red", lty = 1, lwd = 2)
-        title(sprintf("%s: N=%i runs", name, length(res)))
+  }
+
+  mega <- list()
+  for (ctr in seq_len(length(resList))) {
+    dat <- resList[[ctr]]
+    out <- list()
+    overall_acc <- numeric()
+    curRoc <- list()
+    curPr <- list()
+
+    pred_col1 <- sprintf("%s_SCORE", predClasses[1])
+    pred_col2 <- sprintf("%s_SCORE", predClasses[2])
+
+    idx1 <- which(colnames(dat) == pred_col1)
+    idx2 <- which(colnames(dat) == pred_col2)
+    pred <- ROCR::prediction(dat[, idx1] - dat[, idx2],
+            dat$STATUS == predClasses[1])
+
+    c1 <- predClasses[1] #numc[1]
+    tp <- sum(dat$STATUS == dat$PRED_CLASS & dat$STATUS == c1)
+    tn <- sum(dat$STATUS == dat$PRED_CLASS & dat$STATUS != c1)
+    fp <- sum(dat$STATUS != dat$PRED_CLASS & dat$STATUS != c1)
+    fn <- sum(dat$STATUS != dat$PRED_CLASS & dat$STATUS == c1)
+
+    # entire curves
+    curRoc <- ROCR::performance(pred, "tpr", "fpr")
+    curPr <- ROCR::performance(pred, "prec", "rec")
+    tmp <- data.frame(score = 0, tp = tp, tn = tn, fp = fp, fn = fn)
+    out <- perfCalc(tmp)
+
+    # statistic
+    auroc <- performance(pred, "auc")@y.values[[1]]
+    aupr <- prauc(curPr)
+    corr <- sum(dat$STATUS == dat$PRED_CLASS)
+    overall_acc <- c(overall_acc, corr / nrow(dat) * 100)
+
+    ### TODO put in F1.
+    mega[[ctr]] <- list(stats = out$stats, roc_curve = curRoc,
+            pr_curve = curPr,
+            auroc = auroc, aupr = aupr, accuracy = overall_acc)
+  }
+
+  .plotAvg <- function(res, name, plotSEM) {
+    mu <- mean(res, na.rm = TRUE)
+    if (plotSEM) {
+      err <- sd(res, na.rm = TRUE) / sqrt(length(res))
+      errnm <- "SEM"
+    } else {
+      err <- sd(res, na.rm = TRUE)
+      errnm <- "SD"
     }
-    
-    # plot average +/-error
+    plot(1, mu, type = "n", bty = "n",
+            ylab = sprintf("%s (mean+/-%s)", name, errnm),
+            xaxt = "n", ylim = c(0.4, 1), las = 1,
+                xlim = c(0.8, 1.2),
+                cex.axis = 1.4, xlab = "")
+    abline(h = c(0.7, 0.8), col = "cadetblue3", lty = 3, lwd = 3)
+    points(1, mu, type = "p", cex = 1.4, pch = 16)
+
+    # error bars
+    segments(x0 = 1, y0 = mu - err, y1 = mu + err, lwd = 3)
+    segments(x0 = 1 - 0.01, x1 = 1 + 0.01, y0 = mu - err, y1 = mu - err)
+    segments(x0 = 1 - 0.01, x1 = 1 + 0.01, y0 = mu + err, y1 = mu + err)
+    abline(h = 0.5, col = "red", lty = 1, lwd = 2)
+    title(sprintf("%s: N=%i runs", name, length(res)))
+  }
+
+  # plot average +/-error
+  if (drawPlot) {
     par(mfrow = c(2, 2))
     x <- unlist(lapply(mega, function(x) x$auroc))
-    .plotAvg(x, "AUROC",plotSEM)
+    .plotAvg(x, "AUROC", plotSEM)
     x <- unlist(lapply(mega, function(x) x$aupr))
-    .plotAvg(x, "AUPR",plotSEM)
-    
-    # plot individual curves
+    .plotAvg(x, "AUPR", plotSEM)
+  }
+
+  # plot individual curves
+  if (drawPlot) {
     rocCurves <- lapply(mega, function(x) x$roc_curve)
     plotPerf_multi(rocCurves, "ROC")
     prCurves <- lapply(mega, function(x) x$pr_curve)
     plotPerf_multi(prCurves, "PR", plotType = "PR")
-    
-    return(mega)
+  }
+
+  return(mega)
 }
diff --git a/R/predict.R b/R/predict.R
index 4fd8ee0a..0df38814 100644
--- a/R/predict.R
+++ b/R/predict.R
@@ -5,12 +5,12 @@
 #' @param testMAE (MultiAssayExperiment) new patient dataset for testing model. Assays must be the same as for trainMAE.
 #' @param groupList (list) list of features used to train the model. Keys are data types, and values are lists for groupings within those datatypes.
 #' e.g. keys could include {'clinical','rna','methylation'}, and values within 'rna' could include pathway names {'cell cycle', 'DNA repair'}, etc.,
-#' featSel will be used to subset 
-#' @param featSel (list) selected features to be used in the predictive model. 
+#' selectedFeatures will be used to subset 
+#' @param selectedFeatures (list) selected features to be used in the predictive model. 
 #' keys are patient labels (e.g. "responder/nonresponder"), and values are feature names 
 #' identified by running buildPredictor(). Feature names must correspond to names of groupList, from which they will be subset.
 #' @param makeNetFunc (function) function to create PSN features from patient data. See makeNetFunc in buildPredictor() for details
-#' @param impute (logical) if TRUE imputes train and test samples separately before creating features. Currently unsupported.
+#' @param sims (list) rules for creating PSN. Preferred over makeNetFunc.
 #' @param outDir (char) directory for results
 #' @param verbose (logical) print messages
 #' @param numCores (integer) number of CPU cores for parallel processing
@@ -20,19 +20,20 @@
 #' columns are:  1) ID, 2) STATUS (ground truth), 3) <label>_SCORE: similarity score for the corresponding label,
 #' 4) PRED_CLASS: predicted class
 #' @export
-predict <- function(trainMAE, testMAE, groupList, featSel, makeNetFunc, outDir,
-    impute = FALSE, verbose = FALSE, numCores = 1L, JavaMemory = 4L, debugMode = FALSE) {
+predict <- function(trainMAE, testMAE, groupList, 
+  selectedFeatures, 
+  makeNetFunc=NULL, sims=NULL,
+  outDir, verbose = FALSE, 
+  numCores = 1L, JavaMemory = 4L, debugMode = FALSE) {
 
   # input checks
   if (missing(trainMAE)) stop("trainMAE must be supplied.\n")
   if (missing(testMAE)) stop("testMAE must be supplied.\n")
   if (missing(groupList)) stop("groupList must be supplied.\n")
   if (length(groupList) < 1) stop("groupList must be of length 1+\n")
-  if (class(featSel) != "list") stop("featSel must be a list with patient labels as keys, and selected features as values")
+  if (class(selectedFeatures) != "list") stop("selectedFeatures must be a list with patient labels as keys, and selected features as values")
   if (missing(outDir)) stop("outDir must be supplied.\n")
-  if (impute) stop("impute=TRUE is not supported in the current version of netDx. This will be implemented based on future user interest. Please contact Shraddha Pai <shraddha.pai@utoronto.ca> if this feature is required.")
 
-  nm1 <- setdiff(names(groupList), "clinical")
   if (!is(trainMAE, "MultiAssayExperiment"))
     stop("trainMAE must be a MultiAssayExperiment")
   if (!is(testMAE, "MultiAssayExperiment"))
@@ -41,6 +42,7 @@ predict <- function(trainMAE, testMAE, groupList, featSel, makeNetFunc, outDir,
   tmp <- unlist(lapply(groupList, class))
   not_list <- sum(tmp == "list") < length(tmp)
 
+  nm1 <- setdiff(names(groupList), "clinical")
   names_nomatch <- any(!nm1 %in% names(trainMAE))
   if (!is(groupList, "list") || not_list || names_nomatch) {
     msg <- c("groupList must be a list of lists.",
@@ -49,11 +51,13 @@ predict <- function(trainMAE, testMAE, groupList, featSel, makeNetFunc, outDir,
     stop(paste(msg, sep = ""))
   }
 
-  for (nm in names(featSel)) {
-    featSel[[nm]] <- sub("_cont.txt", "", sub(".profile", "", featSel[[nm]]))
+  for (nm in names(selectedFeatures)) {
+    selectedFeatures[[nm]] <- sub(
+      "_cont.txt", "", 
+      sub(".profile", "", selectedFeatures[[nm]]))
   }
   # clean features
-  fs <- unlist(featSel);
+  fs <- unlist(selectedFeatures);
   names(fs) <- NULL
   gl <- c()
   for (k in names(groupList)) {
@@ -62,7 +66,7 @@ predict <- function(trainMAE, testMAE, groupList, featSel, makeNetFunc, outDir,
   }
 
   if (sum(!fs %in% gl) > 0) {
-    stop("One or more entry in featSelNet not found in groupList.")
+    stop("One or more entry in selectedFeatures was not found in groupList.")
   }
 
   # merging train-test for joint db
@@ -99,24 +103,38 @@ predict <- function(trainMAE, testMAE, groupList, featSel, makeNetFunc, outDir,
   message("* Measuring similarity to each known class")
   subtypes <- unique(ph$STATUS)
   predRes <- list()
+
+  # classification for a given class is performed as follows:
+  # you take just the selected features for that class and create a single PSN comprised of the union
+  # of training and test samples.
+  # using training samples for that class ("samples look like non-responders",e.g.) run label propagation
+  # on the PSN. get a similarity ranking for all test samples
+  # now repeat this exercise for all classes.
+  # ultimately, the patient is classified as the class to which they have greatest similarity.
   for (g in subtypes) {
     if (verbose) message(sprintf("\t%s", g))
     pDir <- paste(outDir, g, sep = getFileSep())
-    dir.create(pDir)
-
     netDir <- paste(pDir, "networks", sep = getFileSep())
+    dir.create(pDir)
     dir.create(netDir)
+
+    # make nets using only features selected for the label in question
     pheno_id <- setupFeatureDB(pheno, netDir)
 
+    # checks that exactly one of makeNetFunc/sims was provided (stops otherwise)
+    x <- checkMakeNetFuncSims(makeNetFunc=makeNetFunc, sims=sims,groupList=groupList)
+
     if (verbose) message("Creating PSN")
     createPSN_MultiData(dataList = assays,
         groupList = groupList,
         pheno = pheno_id,
         netDir = netDir,
-        customFunc = makeNetFunc,
+        makeNetFunc = makeNetFunc,
+        sims = sims,
         numCores = 1L,
-        filterSet = featSel[[g]],
+        filterSet = selectedFeatures[[g]],
         verbose = verbose)
+
     dbDir <- compileFeatures(netDir,
       outDir = pDir,
       numCores = numCores,
@@ -126,9 +144,10 @@ predict <- function(trainMAE, testMAE, groupList, featSel, makeNetFunc, outDir,
     # run query for this class
     qSamps <- pheno$ID[which(pheno$STATUS %in% g & pheno$TT_STATUS %in% "TRAIN")]
     qFile <- paste(pDir, sprintf("%s_query", g), sep = getFileSep())
-
     message(sprintf("\t%s : %s training samples", g, prettyNum(length(qSamps), big.mark = ",")))
     writeQueryFile(qSamps, "all", nrow(pheno), qFile)
+
+
     if (verbose) message(sprintf("\t** %s: Compute similarity", g))
     resFile <- runQuery(dbDir$dbDir, qFile, resDir = pDir,
       JavaMemory = JavaMemory, numCores = numCores,
@@ -136,9 +155,18 @@ predict <- function(trainMAE, testMAE, groupList, featSel, makeNetFunc, outDir,
     predRes[[g]] <- getPatientRankings(sprintf("%s.PRANK", resFile), pheno, g)
   }
 
+  # at this point, we should have similarity rankings for each of the test patients, for each of the classes.
   predClass <- predictPatientLabels(predRes,
       verbose = verbose)
   out <- merge(x = pheno, y = predClass, by = "ID")
+  if (nrow(out)!= nrow(colData(testMAE))) {
+      warning(
+        paste(rep("*",25),
+          "Not all patients provided in the test sample were classified.",
+              rep("*",25), sep="\n")
+      )
+  }
+
   acc <- sum(out$STATUS == out$PRED_CLASS) / nrow(out)
   message(sprintf("%s test patients", prettyNum(nrow(out), big.mark = ",")))
   message(sprintf("ACCURACY (N=%i test) = %2.1f%%",
diff --git a/R/runQuery.R b/R/runQuery.R
index ad7f461d..18e7a8da 100644
--- a/R/runQuery.R
+++ b/R/runQuery.R
@@ -17,45 +17,54 @@
 #' queryFile <- system.file("extdata","GM_query.txt",package="netDx")
 #' runQuery(dbPath, queryFile,tempdir())
 #' @export
-runQuery <- function(dbPath, queryFiles, resDir, verbose = TRUE, 
-		JavaMemory = 6L, numCores = 1L,debugMode=FALSE) {
-    
-    GM_jar <- getGMjar_path()
-    qBase <- basename(queryFiles[[1]][1])
-    logFile <- paste(resDir,sprintf("%s.log",qBase))
-    queryStrings <- paste(queryFiles, collapse = " ")
+runQuery <- function(dbPath, queryFiles, resDir, verbose = TRUE,
+    JavaMemory = 6L, numCores = 1L, debugMode = FALSE) {
 
-	args <- c()
-	java_ver <- suppressWarnings(system2("java", 
-		args="--version",stdout=TRUE,stderr=NULL))
-	if (any(grep(" 11",java_ver)) || any(grep(" 12",java_ver)) || any(grep(" 13",java_ver)) || any(grep(" 14",java_ver)) || any(grep(" 16",java_ver))) {
-		if (verbose) message("Java 11 or later detected")
-	} else {
-		if (verbose) message("Java 8 detected")
-		args <- c(args,"-d64")
-	}
+  GM_jar <- getGMjar_path()
+  qBase <- basename(queryFiles[[1]][1])
+  logFile <- paste(resDir, sprintf("%s.log", qBase))
+  queryStrings <- paste(queryFiles, collapse = " ")
 
-    args <- c(args, sprintf("-Xmx%iG", JavaMemory * numCores), "-cp", GM_jar)
-    args <- c(args, "org.genemania.plugin.apps.QueryRunner")
-    args <- c(args, "--data", dbPath, "--in", "flat", "--out", "flat")
-    args <- c(args, "--threads", numCores, "--results", resDir, 
-			unlist(queryFiles))
-    args <- c(args, "--netdx-flag", "true")  #,'2>1','/dev/null')
-    
-    # file is not actually created - is already split in PRANK and 
-		# NRANK segments on
-    # GeneMANIA side
-    resFile <- paste(resDir,sprintf("%s-results.report.txt",qBase),
-		sep=getFileSep())
-    t0 <- Sys.time()
-	if (debugMode) {
-		message(sprintf("java %s",paste(args,collapse=" ")))
-    	system2("java", args, wait = TRUE)
-	} else {
-    	system2("java", args, wait = TRUE, stdout = NULL, stderr = NULL)
-	}
-    if (verbose) 
-        message(sprintf("QueryRunner time taken: %1.1f s", Sys.time() - t0))
-    Sys.sleep(3)
-    return(resFile)
+  args <- c()
+  # 'java --version' exists only on Java 9+; on Java 8 it fails, leaving java_ver empty
+  java_ver <- suppressMessages(suppressWarnings(system2("java",
+    args = "--version", stdout = TRUE, stderr = NULL)))
+  # first output line looks like 'openjdk 17.0.2 2022-01-18': take the major
+  # version number instead of grepping for a fixed list of releases, which
+  # missed Java 15 and 21+ and sent them down the Java 8 branch
+  java_major <- suppressWarnings(as.integer(
+    sub("^[^0-9]*([0-9]+).*$", "\\1", java_ver[1])))
+  if (isTRUE(java_major >= 9L)) {
+    if (verbose) message(sprintf("Java %i detected", java_major))
+    # --illegal-access is only honoured by Java 9-16; Java 17+ ignores it
+    if (java_major <= 16L) args <- c(args, "--illegal-access=permit")
+  } else {
+    # -d64 is rejected by Java 10+, so only pass it when no version was parsed
+    if (verbose) message("Java 8 detected")
+    args <- c(args, "-d64")
+  }
+
+  args <- c(args, sprintf("-Xmx%iG", JavaMemory * numCores), "-cp", GM_jar)
+  args <- c(args, "org.genemania.plugin.apps.QueryRunner")
+  args <- c(args, "--data", dbPath, "--in", "flat", "--out", "flat")
+  args <- c(args, "--threads", numCores, "--results", resDir,
+      unlist(queryFiles))
+  args <- c(args, "--netdx-flag", "true") #,'2>1','/dev/null')
+
+  # file is not actually created - is already split in PRANK and 
+  # NRANK segments on
+  # GeneMANIA side
+  resFile <- paste(resDir, sprintf("%s-results.report.txt", qBase),
+    sep = getFileSep())
+  t0 <- Sys.time()
+  if (debugMode) {
+    message(sprintf("java %s", paste(args, collapse = " ")))
+    system2("java", args, wait = TRUE)
+  } else {
+    system2("java", args, wait = TRUE, stdout = NULL, stderr = NULL)
+  }
+  if (verbose)
+    message(sprintf("QueryRunner time taken: %1.1f s", Sys.time() - t0))
+  Sys.sleep(3)
+  return(resFile)
 }
diff --git a/R/similarities.R b/R/similarities.R
index b4c988e0..9042a599 100644
--- a/R/similarities.R
+++ b/R/similarities.R
@@ -42,8 +42,8 @@ sim.pearscale <- function(dat, K = 20, alpha = 0.5) {
         z1 <- ztrans(dat)
         euc <- as.matrix(dist(z1, method = "euclidean"))^(1/2)
     } else {
-        euc <- as.matrix(1 - cor(dat, method = "pearson", 
-					use = "pairwise.complete.obs"))
+      euc <- as.matrix(1 - cor(dat, method = "pearson",
+        use = "pairwise.complete.obs"))
     }
     N <- nrow(euc)
     euc <- (euc + t(euc))/2
@@ -52,8 +52,8 @@ sim.pearscale <- function(dat, K = 20, alpha = 0.5) {
     finiteMean <- function(x) {
         return(mean(x[is.finite(x)], na.rm = TRUE))
     }
-    means <- apply(sortedColumns[, seq_len(K) + 1], 1, finiteMean) 
-		means <- means + .Machine$double.eps
+  means <- apply(sortedColumns[, seq_len(K) + 1], 1, finiteMean)
+    means <- means + .Machine$double.eps
     avg <- function(x, y) {
         return((x + y)/2)
     }
@@ -206,3 +206,190 @@ avgNormDiff <- function(x) {
     sim <- sim/nrow(x)
     sim
 }
+
+#' Keywords naming the built-in similarity functions accepted in sims
+#' @return (char) names of the built-in similarity functions
+allowedSims <- function() {
+  builtIn <- c("pearsonCorr", "normDiff", "avgNormDiff")
+  c(builtIn, "sim.pearscale", "sim.eucscale")
+}
+
+
+
+#' checks if provided similarity functions are valid. Throws error if not
+#'
+#' @param sims (list) keys are layer names, values are functions or characters (names of built-in similarity functions)
+#' @return TRUE if all pass check. Else throws error.
+checkSimValid <- function(sims) {
+  allowed <- allowedSims()
+  for (k in names(sims)) {
+    cur <- sims[[k]]
+    # custom similarity functions are accepted as-is
+    if (is.function(cur)) next
+    # is.*() tests instead of class() comparisons, which yield a condition of
+    # length > 1 for objects carrying more than one class
+    if (!is.character(cur) || length(cur) != 1L) {
+      stop(paste("Invalid sims datatype. ",
+        "sims entries must be functions or keywords (characters) ",
+        "for built-in similarity functions.", sep = ""))
+    }
+    if (!cur %in% allowed) {
+      stop(sprintf(
+        "sims[[%s]] has invalid similarity type: %s. Allowed values are: {%s}",
+        k, cur, paste(allowed, collapse = ",")))
+    }
+  }
+  TRUE
+}
+
+#' internal test function to check validity of makeNetFunc and sims
+#'
+#' @details User must provide exactly one of makeNetFunc or sims. This
+#' function confirms this, and that names(sims) match names(groupList).
+#' @param makeNetFunc (function) makeNetFunc from buildPredictor()
+#' @param sims (list) sims from buildPredictor()
+#' @param groupList (list) groupList from buildPredictor()
+#' @return TRUE if the inputs pass all checks. Else throws error.
+checkMakeNetFuncSims <- function(makeNetFunc, sims, groupList) {
+  # exactly one of the two must be supplied: both NULL or both set is an error
+  if (is.null(makeNetFunc) == is.null(sims)) {
+    stop("Provide either makeNetFunc or sims (preferred).")
+  }
+
+  if (!is.null(sims)) {
+    if (!inherits(sims, "list")) stop("sims must be a list.")
+    # all.equal() returns a character vector of differences when the names
+    # do not match, so it must be wrapped in isTRUE() before use in if()
+    sameNames <- isTRUE(all.equal(sort(names(sims)), sort(names(groupList))))
+    if (!sameNames) stop("names(sims) must match names(groupList).")
+  }
+  TRUE
+}
+
+#' Create PSN from provided similarities
+#'
+#' @details Called by createPSN_MultiData(), this is the function that converts user-provided
+#' similarity metrics to internal netDx function calls to generate nets.
+#' @param dataList (list) patient data, output of dataList2List()
+#' @param groupList (list) measure groupings. Keys match assays(dataList) and are usually different data sources. Values for each are a list of
+#' networks with user-provided groupings. See groupList in buildPredictor() for details.
+#' @param netDir (char) path to directory where networks are to be created
+#' @param sims (list) keys must be identical to those of groupList. Values are either of type character, used for built-in similarity functions,
+#' or are functions, when a custom function is provided.
+#' @param verbose (logical) print messages
+#' @param ... values to be passed to PSN creation functions such as makePSN_NamedMatrix().
+#' @return (char) names of networks created. Side effect of network creation.
+#' @export
+createNetFuncFromSimList <- function(dataList, groupList, netDir, sims,
+    verbose = TRUE, ...) {
+  if (length(groupList) != length(sims)) {
+    stop("groupList and sims need to be of same length.")
+  }
+  # all.equal() returns a character vector of differences on mismatch, so
+  # wrap it in isTRUE() instead of comparing its result to TRUE inside if()
+  if (!isTRUE(all.equal(sort(names(groupList)), sort(names(sims))))) {
+    stop("names(groupList) needs to match names(sims).")
+  }
+  settings <- list(dataList = dataList, groupList = groupList,
+    netDir = netDir, sims = sims)
+
+  if (verbose) message("Making nets from sims")
+  netList <- c()
+  for (nm in names(sims)) {
+    csim <- sims[[nm]]
+    if (verbose) message(sprintf("\t%s", nm))
+
+    # layers without any groupings produce no networks
+    if (is.null(groupList[[nm]])) next
+    cur_set <- settings
+    cur_set[["name"]] <- nm
+    cur_set[["similarity"]] <- csim
+
+    # dispatch: custom function, Pearson correlation, or other built-in keyword
+    if (is.function(csim)) {
+      netList_cur <- psn__custom(cur_set, csim, verbose, ...)
+    } else if (csim == "pearsonCorr") {
+      netList_cur <- psn__corr(cur_set, verbose, ...)
+    } else {
+      netList_cur <- psn__builtIn(cur_set, verbose, ...)
+    }
+    netList <- c(netList, netList_cur)
+  }
+  if (verbose) message("Net construction complete!")
+  unlist(netList)
+}
+
+#' make PSN for built-in similarity functions
+#'
+#' @param settings (list) from makeNetFunc
+#' @param verbose (logical) print messages
+#' @param ... parameters for makePSN_NamedMatrix()
+#' @return (char) names of networks created. Side effect of network creation.
+psn__builtIn <- function(settings, verbose, ...) {
+  # keyword -> implementation ("pearsonCorr" is handled by psn__corr())
+  funcs <- list(
+    "normDiff" = normDiff,
+    "avgNormDiff" = avgNormDiff,
+    "sim.pearscale" = sim.pearscale,
+    "sim.eucscale" = sim.eucscale
+  )
+  nm <- settings$name
+  simName <- settings$similarity
+  if (verbose) message(sprintf("Layer %s: Built-in function %s", nm, simName))
+  # an unknown keyword would otherwise silently pass customFunc = NULL
+  simFunc <- funcs[[simName]]
+  if (is.null(simFunc)) {
+    stop(sprintf("Layer %s: unknown built-in similarity '%s'.", nm, simName))
+  }
+  makePSN_NamedMatrix(
+    settings$dataList[[nm]],
+    rownames(settings$dataList[[nm]]),
+    settings$groupList[[nm]],
+    settings$netDir,
+    simMetric = "custom", customFunc = simFunc,
+    writeProfiles = FALSE, sparsify = TRUE, ...
+  )
+}
+
+#' Build patient similarity networks for one layer with a user-supplied metric
+#'
+#' @param settings (list) from makeNetFunc
+#' @param fn (function) custom similarity function
+#' @param verbose (logical) print messages
+#' @param ... parameters for makePSN_NamedMatrix()
+#' @return (char) names of networks created. Side effect of network creation.
+psn__custom <- function(settings, fn, verbose, ...) {
+  layer <- settings$name
+  if (verbose) message(sprintf("Layer %s: CUSTOM FUNCTION", layer))
+  layerData <- settings$dataList[[layer]]
+  nets <- makePSN_NamedMatrix(
+    layerData,
+    rownames(layerData),
+    settings$groupList[[layer]],
+    settings$netDir,
+    simMetric = "custom", customFunc = fn, # fn computes the similarity
+    writeProfiles = FALSE, sparsify = TRUE, ...
+  )
+  nets
+}
+
+#' Build patient similarity networks for one layer using Pearson correlation
+#'
+#' @param settings (list) from makeNetFunc
+#' @param verbose (logical) print messages
+#' @param ... parameters for makePSN_NamedMatrix()
+#' @return (char) names of networks created. Side effect of network creation.
+psn__corr <- function(settings, verbose, ...) {
+  layer <- settings$name
+  if (verbose) message(sprintf("Layer %s: PEARSON CORR", layer))
+  layerData <- settings$dataList[[layer]]
+  # makePSN_NamedMatrix() is always called with verbose = FALSE here
+  nets <- makePSN_NamedMatrix(
+    xpr = layerData,
+    nm = rownames(layerData),
+    namedSets = settings$groupList[[layer]],
+    outDir = settings$netDir,
+    verbose = FALSE, writeProfiles = TRUE, ...
+  )
+  nets
+}
\ No newline at end of file
diff --git a/R/smooMutationPropagation.R b/R/smooMutationPropagation.R
deleted file mode 100755
index 935ca86d..00000000
--- a/R/smooMutationPropagation.R
+++ /dev/null
@@ -1,176 +0,0 @@
-# Functions for smoothing mutation networks using interaction networks
-
-#' This function applies the random walk with restart propagation algorithm to a
-#' matrix of patients profiles
-#'
-#' @details A network is an undirected graph G defined by a set of nodes
-#'   corresponding to genes, and edges connecting nodes with an experimental
-#'   evidence of interaction. A priori nodes are genes for which an information
-#'   is known. A novel node is a candidate for being associated to the nodes
-#'   above based on their information. A node prediction task leads to detect
-#'   novel nodes and propagation techniques are largely applied for the purpose.
-#'   Network-based propagation algorithms for node prediction transfer the
-#'   information from a priori nodes to any other node in a network. Each node
-#'   gets an imputation value which assesses how much information got. The
-#'   prediction is based on the guilty-by-association principle. A node with a
-#'   high imputation value has a high probability to be associated to a priori
-#'   nodes. E.g. in a house where room A has one heater, if room B is the second
-#'   hottest room it means that B is close to A and that there is a high
-#'   probability that they share a door or wall. These algorithms exploit the
-#'   global topology of the network. However, when they are applied to detect if
-#'   unknown nodes are functionally associated to known ones, they may suffer of
-#'   a drawback depending by the context. In biology, two functionally related
-#'   fragments interact physically (direct interaction) or interact indirectly
-#'   thanks to one or very few mediators. Therefore, exploring too far
-#'   similarities between nodes can introduce noise in the prediction. We apply
-#'   a random walk with restart propagation algorithm which resolution is set to
-#'   0.2 for giving high values only to the close neighbours of the a priori
-#'   nodes.
-#' @param mat (data.frame) Sparse matrix of binarized patient profiles, with
-#'	rownames being unique patients and columns, unique genes. Entry [i,j] is
-#' 	set to 1 if patient j has a mutation in gene i.
-#' @param net (data.frame) Interaction network provided as an adjacency
-#' matrix (i.e. symmetric)
-#' @param numCores (integer) Number of cores for parallel processing
-#' @return (data.frame) Continuous matrix of patient profiles in which each gene
-#'   has the final propagation score
-#' @import doParallel
-#' @examples 
-#' suppressWarnings(suppressMessages(require(MultiAssayExperiment)))
-#' require(doParallel)
-#' 
-#' # load mutation and phenotype data
-#' genoFile <- system.file("extdata","TGCT_mutSmooth_geno.txt",package="netDx")
-#' geno <- read.delim(genoFile,sep="\t",header=TRUE,as.is=TRUE)
-#' phenoFile <- system.file("extdata", "TGCT_mutSmooth_pheno.txt",
-#'				package="netDx")
-#' pheno <- read.delim(phenoFile,sep="\t",header=TRUE,as.is=TRUE)
-#' rownames(pheno) <- pheno$ID
-#' 
-#' # load interaction nets to smooth over
-#' require(BiocFileCache)
-#' netFileURL <- paste("https://download.baderlab.org/netDx/",
-#' 	"supporting_data/CancerNets.txt",sep="")
-#' cache <- rappdirs::user_cache_dir(appname = "netDx")
-#' bfc <- BiocFileCache::BiocFileCache(cache,ask=FALSE)
-#' netFile <- bfcrpath(bfc,netFileURL)
-#' cancerNets <- read.delim(netFile,sep="\t",header=TRUE,as.is=TRUE)
-#' # smooth mutations
-#' prop_net <- smoothMutations_LabelProp(geno,cancerNets,numCores=1L)
-#' @export
-smoothMutations_LabelProp <- function(mat,net,numCores=1L) {
-	if (class(mat) == "data.frame") mat <- as.matrix(mat)
-	if (class(net) == "data.frame") net <- as.matrix(net)
-  #Split the matrix into sections, each one will be processed by one core
-  inds <- split(seq_len(ncol(mat)), 
-		sort(rep_len(seq_len(numCores), 
-		ncol(mat))))
-
-  res.l <- list()
-
-  required <- c("scater","clusterExperiment","netSmooth")
-  ctr <- 0
-  for (cur in required) {
-    if (!requireNamespace(cur, quietly=TRUE)) {
-      message(sprintf("Package \"%s\" needed for smoothMutations_LabelProp() to work. Please install it."))
-      ctr <- ctr+1
-    }
-    if (ctr >0) stop("Please install needed packages before proceeding.",call.=FALSE)
-  }
-
-  #Apply parallelized propagation
-	cl <- makeCluster(numCores)
-	registerDoParallel(cl)
-
-	# to address the "no visible binding for global variable" error
-	k <- NULL
-  res.l <- foreach(k = 1:length(inds),
-	.packages=c("netSmooth","scater","clusterExperiment")) %dopar% {
-    nS.res=netSmooth::netSmooth(mat[,inds[[k]]], 
-      net , alpha=0.2, verbose = 'auto', 
-		  normalizeAdjMatrix = c("columns")) 
-    return(nS.res)
-  }
-	stopCluster(cl)
-
-  #Merge the results
-  nS.res <- do.call(cbind, res.l)
-
-  #Return the final propagated matrix
-  return(nS.res)
-}
-
-#' Apply discretization to the matrix resulted from the propagation on the
-#' sparse patient matrix
-#'
-#' @details This function is included in the netDx use case which involves
-#'   propagating the sparse matrix of patient's profiles to reduce its sparsity.
-#'   This function applies discretization on the propagated matrix of patient
-#'   profiles. It sets to 1 the genes which got the highest propagation value.
-#'   While, the remaining genes are set to 0. This discretization is driven by
-#'   the fact that higher is the propagation value and higher is the chance that
-#'   the gene is involved in the patient condition and expression/mutation
-#'   profile. On the contrary, genes which got either a medium or a low value
-#'   are not trustable.
-#' @param smoothedMutProfile (data.frame) continous matrix of patient profiles 
-#' resulting from applying :.,$ s/network-based propagation algorithm 
-#' (smoothMutations_LabelProp()) on a binary somatic mutation sparse matrix.
-#' @param unsmoothedMutProfile (data.frame) binary somatic mutation sparse 
-#' matrix. Rownames are unique genes. Colnames are unique patients. A cell 
-#' contains a zero or a one.
-#' @param nameDataset (char) for titles on plot
-#' @param n_topXmuts (numeric between 0 and 1) percent of top mutations
-#' to keep. This function converts these to 1.0 when binarizing, so they
-#' remain in the thresholded output matrix; other mutations are set to zero.
-#' @return (data.frame) binary somatic mutation matrix which sparsity has been 
-#' decreased
-#' @examples 
-#' suppressWarnings(suppressMessages(require(MultiAssayExperiment)))
-#' require(doParallel)
-#' 
-#' # load mutation and phenotype data
-#' genoFile <- system.file("extdata","TGCT_mutSmooth_geno.txt",package="netDx")
-#' geno <- read.delim(genoFile,sep="\t",header=TRUE,as.is=TRUE)
-#' phenoFile <- system.file("extdata", "TGCT_mutSmooth_pheno.txt",
-#'				package="netDx")
-#' pheno <- read.delim(phenoFile,sep="\t",header=TRUE,as.is=TRUE)
-#' rownames(pheno) <- pheno$ID
-#' 
-#' # load interaction nets to smooth over
-#' require(BiocFileCache)
-#' netFileURL <- paste("https://download.baderlab.org/netDx/",
-#' 	"supporting_data/CancerNets.txt",sep="")
-#' cache <- rappdirs::user_cache_dir(appname = "netDx")
-#' bfc <- BiocFileCache::BiocFileCache(cache,ask=FALSE)
-#' netFile <- bfcrpath(bfc,netFileURL)
-#' cancerNets <- read.delim(netFile,sep="\t",header=TRUE,as.is=TRUE)
-#' # smooth mutations
-#' prop_net <- smoothMutations_LabelProp(geno,cancerNets,numCores=1L)
-#' genoP <- thresholdSmoothedMutations(
-#'    prop_net,geno,"TGCT_CancerNets",c(20)
-#'   )
-#' @export 
-thresholdSmoothedMutations <- function(smoothedMutProfile,
-		unsmoothedMutProfile,
-		nameDataset,n_topXmuts=c(10)){
-  smoothedMutProfile=apply(-smoothedMutProfile,2,rank)
-  n_muts=colSums(unsmoothedMutProfile)
-  
-  smoothedMutProfiles_l=list()
-  for(k_top in 1:length(n_topXmuts)){
-    name_prop=paste(nameDataset,"_x",n_topXmuts[k_top],sep="")
-    n_new_muts=n_muts*n_topXmuts[k_top]
-    for(i_col in 1:length(n_new_muts)){
-      smoothedMutProfile[smoothedMutProfile[,i_col]<=n_new_muts[i_col],i_col]=1
-      smoothedMutProfile[smoothedMutProfile[,i_col]>n_new_muts[i_col],i_col]=0
-    }
-    smoothedMutProfiles_l[[name_prop]]=smoothedMutProfile
-  }
-  
-  if(length(smoothedMutProfiles_l)!=1){
-    return(smoothedMutProfiles_l)
-  }
-  if(length(smoothedMutProfiles_l)==1){
-    return(smoothedMutProfile)
-  }
-}
diff --git a/R/splitTestTrain_partition.R b/R/splitTestTrain_partition.R
index b14f9ab2..667b03d7 100644
--- a/R/splitTestTrain_partition.R
+++ b/R/splitTestTrain_partition.R
@@ -79,3 +79,44 @@ splitTestTrain_resampling <- function(pheno_DF, nFold = 3L, predClass,
     
     out
 }
+
+
+#' Subsample a hold-out set from a larger patient dataset
+#'
+#' @details Creates a partition of data to be used for model validation after initial model building.
+#' In netDx, buildPredictor() is used for model training, and selected features from this exercise
+#' are used to validate a held-out dataset with the predict() function. The same number of patients,
+#' round(pctValidation * N / number of classes), is drawn at random from every STATUS class, so the
+#' hold-out set may not reflect the class proportions or other structure of the training data. Please
+#' use data exploration techniques (e.g. UMAP) to ensure that validation accuracy is not confounded
+#' by stratification.
+#' @param dataMAE (MultiAssayExperiment) patient data to be subsampled. colData must have columns
+#' ID (patient ID) and STATUS
+#' @param pctValidation (numeric) Fraction of dataset to include in the validation set. Value from 0.05 to 0.95.
+#' @param verbose (logical) print a one-line summary of the split
+#' @return (list) Keys are trainMAE and validationMAE. These contain corresponding MultiAssayExperiments for
+#' training and validation data
+#' @export
+subsampleValidationData <- function(dataMAE,pctValidation=0.2,verbose=TRUE) {
+    if (missing(dataMAE)) stop("Supply dataMAE.")
+    # inherits() yields a single logical even when class() has more than one element
+    if (!inherits(dataMAE, "MultiAssayExperiment")) stop("dataMAE must be an object of type MultiAssayExperiment.")
+    if (pctValidation < 0.05 || pctValidation > 0.95) stop("pctValidation should be between 0.05 and 0.95.")
+
+    pheno <- colData(dataMAE)
+    st <- unique(pheno$STATUS)
+    nsamp <- round(pctValidation/length(st) * nrow(pheno))  # hold-out size per class
+    idx_holdout <- c()
+    for (k in st) {
+        members <- which(pheno$STATUS == k)
+        if (nsamp > length(members)) stop(sprintf("Class %s has %i patients; cannot hold out %i.", k, length(members), nsamp))
+        # index through sample.int(): sample(x, n) would draw from 1:x if a class had one member
+        idx_holdout <- c(idx_holdout, members[sample.int(length(members), nsamp)])
+    }
+    if (verbose) message(sprintf("Holding out %i of %i patients for validation", length(idx_holdout), nrow(pheno)))
+
+    holdout <- dataMAE[,rownames(pheno)[idx_holdout]]
+    colData(holdout)$ID <- as.character(colData(holdout)$ID)
+    tokeep <- setdiff(seq_len(nrow(pheno)),idx_holdout)
+    return(list(trainMAE=dataMAE[,rownames(pheno)[tokeep]], validationMAE=holdout))
+}
\ No newline at end of file
diff --git a/R/utils.R b/R/utils.R
index dc05b943..3d428b41 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -20,6 +20,6 @@ getFileSep <- function(){
 #' @export
 randAlphanumString <- function(numStrings = 1L) {
   a <- do.call(paste0, replicate(5, sample(LETTERS, numStrings, TRUE), FALSE))
-  paste0(a, sprintf("%04d", sample(9999, numStrings, TRUE)), 
-		sample(LETTERS, numStrings, TRUE))
-}
+  paste0(a, sprintf("%04d", sample(9999, numStrings, TRUE)),
+  sample(LETTERS, numStrings, TRUE))
+}
\ No newline at end of file
diff --git a/R/zzz.R b/R/zzz.R
index dca43212..ee2f0bbe 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -1,3 +1,18 @@
 .onLoad <- function(libname, pkgname) {
   options(java.parameters = c("-Xmx10G"))
+  # Verify a usable Java runtime up front. "java -version" (printed to stderr) is used because
+  # Java 8 rejects "--version"; Sys.which() avoids a raw system2() error when java is absent.
+  if (!nzchar(Sys.which("java"))) stop("Java not detected. Please install before proceeding.")
+  x <- suppressWarnings(system2("java", "-version", stdout = TRUE, stderr = TRUE))
+  # Major version = leading number of the quoted token; legacy "1.8.0_281" means Java 8.
+  m <- regmatches(x, regexpr("(?<=\")(1\\.)?[0-9]+", x, perl = TRUE))
+  verNum <- as.integer(sub("^1\\.", "", m[1]))
+  if (is.na(verNum)) {
+    warning("Unable to determine Java version. netDx requires Java 16 or earlier to run.")
+  } else if (verNum > 16) {
+    stop(paste("Incorrect Java version.\n",
+      " Your Java version is ", verNum, ". netDx requires Java 16 or earlier to run.",
+      " Please see https://github.com/RealPaiLab/netDx/blob/master/README.md for instructions on how to do this.",
+      sep="", collapse=""))
+  }
 }
\ No newline at end of file
diff --git a/README.md b/README.md
index ec10561b..bbf08b79 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,109 @@
+# netDx: Network-based patient classifier
+[![Docker Build](https://github.com/RealPaiLab/netDx/actions/workflows/push-docker.yml/badge.svg)](https://github.com/RealPaiLab/netDx/actions/workflows/push-docker.yml)
+[![R CMD check bioc](https://github.com/RealPaiLab/netDx/actions/workflows/check-bioc.yml/badge.svg)](https://github.com/RealPaiLab/netDx/actions/workflows/check-bioc.yml)
+
+netDx is for biomedical researchers who want to integrate multi-modal patient data to predict outcome or patient subtype. 
+netDx builds interpretable machine-learning patient classifiers. Unlike standard machine-learning tools, netDx allows modeling of user-defined biological groups as input features; examples include pathways and co-regulated elements. In addition to patient classification, top-scoring features provide mechanistic insight, helping drive hypothesis generation for downstream experiments. netDx currently provides native support for pathway-level features but can be generalized to any user-defined data type and grouping.
+
+## Install
+
+### Get Dockerized version
+Get the latest stable version of netDx with all dependencies installed in a Docker container. 
+
+Connect your code and data to the container by mapping volumes from your machine to Docker and run RStudio in a web browser. See example below.
+
+These steps were tested on a 2020 MacBook Pro with 32 GB RAM.
+
+1) Install Docker, and under Resources for Docker Desktop's settings set Memory to 5 GB.
+2) In terminal, pull the netDx image from Docker:
+	`docker pull realpailab/netdx`
+3) To run the image as a container,  save the following code as a bash script named `startDocker.sh`:
+```
+#!/bin/bash
+	
+docker run -d --rm \
+    -p 8787:8787 \
+    -e PASSWORD=netdx \
+    -v /path/to/software/dir:/software \
+    -v /path/to/data/dir:/data \
+    --name [containerName] \
+    realpailab/netdx
+```
+4) Modify the bash script so that it's executable and run it:
+```
+chmod u+x startDocker.sh
+./startDocker.sh
+```
+5) Access RStudio by going to your web browser and typing in `localhost:8787`, and sign in with the username `rstudio` and password `netdx`.
+6) You can then run the vignettes in R:
+```
+setwd("/home/rstudio/vignettes")
+rmarkdown::render("RunPredictorWorkflow.Rmd")
+```
+
+
+### Install in Rstudio
+
+
+1. If you don't already have it, install [base R](https://www.r-project.org/) and [RStudio Desktop](https://www.rstudio.com/products/rstudio/download/).
+
+2. Install Java runtime for your OS.
+**Note:** netDx requires *Java 16 or earlier*; it will not work with Java 17 or higher.
+If you have Java 16 or earlier, go to step 3.
+
+**Unix:** Find and install the latest JDK and JRE libraries by running this on terminal:
+```
+apt-cache search jdk
+sudo apt-get install openjdk-11-jre openjdk-11-jdk
+```
+This operation requires system administrator privileges. You are done, move to Step 3.
+
+**OS X:** 
+[Download Java 16.0.1 (build 16.0.1+9) for 64-bit Mac](https://download.java.net/java/GA/jdk16.0.1/7147401fd7354114ac51ef3e1328291f/9/GPL/openjdk-16.0.1_osx-x64_bin.tar.gz). This is a compressed archive, not an installer.
+
+Install it by opening terminal and typing the following:
+```
+sudo mv openjdk-16.0.1_osx-x64_bin.tar.gz /Library/Java/JavaVirtualMachines/
+cd /Library/Java/JavaVirtualMachines/
+sudo tar -xzf openjdk-16.0.1_osx-x64_bin.tar.gz
+sudo rm openjdk-16.0.1_osx-x64_bin.tar.gz
+```
+The above operation requires `sudo` privileges (system admin privileges) on the machine.
+
+Now in Rstudio, change the value of `JAVA_HOME` to point it to this version of Java:
+```
+Sys.setenv(JAVA_HOME="/Library/Java/JavaVirtualMachines/jdk-16.0.1.jdk/Contents/Home")
+```
+
+Check to make sure R is using Java 16.0.1:
+```
+system2("java","--version")
+```
+
+If you see this, you're set!
+```
+openjdk 16.0.1 2021-04-20
+OpenJDK Runtime Environment (build 16.0.1+9-24)
+OpenJDK 64-Bit Server VM (build 16.0.1+9-24, mixed mode, sharing)
+```
+
+3. In R, [install BioConductor](https://www.bioconductor.org/install/) if needed. 
+```
+if (!requireNamespace("BiocManager", quietly = TRUE))
+    install.packages("BiocManager")
+BiocManager::install(version = "3.14")
+```
+
+4. Install netDx.
+```
+BiocManager::install("netDx",dependencies=TRUE)
+```
+
+
+---
+
+### Main repo for netDx dev work as of Sep 2021.
+
 netDx is a general-purpose algorithm for building patient classifiers by using patient similarity networks as features. It excels at interpretability and handling missing data. It also allows custom grouping rules for features, notably grouping genes into pathways. It integrates with RCy3 for network visualization of predictive pathways.
 
 As of February 2020, netDx is available via the BioConductor repository. 
@@ -5,6 +111,7 @@ Visit http://bioconductor.org/packages/release/bioc/html/netDx.html to install t
 
 Contact Shraddha Pai at shraddha.pai@utoronto.ca in case of questions.
 
+
 References: 
 
 1. Pai S, Hui S, Isserlin R, Shah MA, Kaka H and GD Bader (2019). netDx: Interpretable patient classification using patient similarity networks. *Mol Sys Biol*. 15: e8497. [Read the paper here](https://www.embopress.org/doi/full/10.15252/msb.20188497).
diff --git a/data/cnv_GR.rda b/data/cnv_GR.rda
index e4245fad..24ee75aa 100644
Binary files a/data/cnv_GR.rda and b/data/cnv_GR.rda differ
diff --git a/data/cnv_patientNetCount.rda b/data/cnv_patientNetCount.rda
index 0a72e9d3..a5739127 100644
Binary files a/data/cnv_patientNetCount.rda and b/data/cnv_patientNetCount.rda differ
diff --git a/data/cnv_pheno.rda b/data/cnv_pheno.rda
index 2431f8ca..ba3fbc31 100644
Binary files a/data/cnv_pheno.rda and b/data/cnv_pheno.rda differ
diff --git a/data/model_full.rda b/data/model_full.rda
new file mode 100644
index 00000000..d893c6ab
Binary files /dev/null and b/data/model_full.rda differ
diff --git a/data/toymodel.rda b/data/toymodel.rda
index caa1293c..e9c92383 100644
Binary files a/data/toymodel.rda and b/data/toymodel.rda differ
diff --git a/data/xpr.rda b/data/xpr.rda
index 37353053..aa7c22bc 100644
Binary files a/data/xpr.rda and b/data/xpr.rda differ
diff --git a/man/allowedSims.Rd b/man/allowedSims.Rd
new file mode 100644
index 00000000..06b08570
--- /dev/null
+++ b/man/allowedSims.Rd
@@ -0,0 +1,11 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/similarities.R
+\name{allowedSims}
+\alias{allowedSims}
+\title{built-in similarity functions}
+\usage{
+allowedSims()
+}
+\description{
+built-in similarity functions
+}
diff --git a/man/buildPredictor.Rd b/man/buildPredictor.Rd
index b0480695..a4dca2da 100644
--- a/man/buildPredictor.Rd
+++ b/man/buildPredictor.Rd
@@ -7,8 +7,9 @@
 buildPredictor(
   dataList,
   groupList,
-  outDir = tempdir(),
-  makeNetFunc,
+  outDir = normalizePath(tempdir()),
+  makeNetFunc = NULL,
+  sims = NULL,
   featScoreMax = 10L,
   trainProp = 0.8,
   numSplits = 10L,
@@ -49,6 +50,10 @@ directory exists, its contents will be overwritten. Must be absolute path}
 \item{makeNetFunc}{(function) user-defined function for creating the set
 of input PSN provided to netDx. See createPSN_MultiData()::customFunc.}
 
+\item{sims}{(list) rules to create similarity networks from input data. Keys are names of
+data layers and should be identical to names(groupList). Each value is either a character
+naming a built-in similarity function (call allowedSims() to see the full list) or a custom function.}
+
 \item{featScoreMax}{(integer) number of CV folds in inner loop}
 
 \item{trainProp}{(numeric 0 to 1) Percent samples to use for training}
@@ -214,6 +219,6 @@ makeNets <- function(dataList, groupList, netDir,...) {
 # takes 10 minutes to run
 #out <- buildPredictor(dataList=brca,groupList=groupList,
 #   makeNetFunc=makeNets, ### custom network creation function
-#   outDir=paste(tempdir(),"pred_output",sep=getFileSep()), ## absolute path
+#   outDir=paste(normalizePath(tempdir()),"pred_output",sep=getFileSep()), ## absolute path
 #   numCores=16L,featScoreMax=2L, featSelCutoff=1L,numSplits=2L)
 }
diff --git a/man/buildPredictor_sparseGenetic.Rd b/man/buildPredictor_sparseGenetic.Rd
index 7dbfce64..c413e84b 100644
--- a/man/buildPredictor_sparseGenetic.Rd
+++ b/man/buildPredictor_sparseGenetic.Rd
@@ -9,7 +9,7 @@ buildPredictor_sparseGenetic(
   cnv_GR,
   predClass,
   group_GRList,
-  outDir = tempdir(),
+  outDir = normalizePath(tempdir()),
   numSplits = 3L,
   featScoreMax = 10L,
   filter_WtSum = 100L,
@@ -147,7 +147,7 @@ cnv_GR    <- GRanges(pheno$seqnames,IRanges(pheno$start,pheno$end),
                         ID=pheno$ID,LOCUS_NAMES=pheno$Gene_symbols)
 
 # get gene coordinates
-geneURL <- paste("https://download.baderlab.org/netDx/",
+geneURL <- paste("http://download.baderlab.org/netDx/",
 	"supporting_data/refGene.hg18.bed",sep="")
 cache <- rappdirs::user_cache_dir(appname = "netDx")
 bfc <- BiocFileCache::BiocFileCache(cache,ask=FALSE)
@@ -158,7 +158,7 @@ gene_GR     <- GRanges(genes[,1],IRanges(genes[,2],genes[,3]),
    name=genes[,4])
 
 # create GRangesList of pathways
-pathFile <- fetchPathwayDefinitions("February",2018,verbose=TRUE)
+pathFile <- fetchPathwayDefinitions("February",2021,verbose=TRUE)
 pathwayList <- readPathways(pathFile)
 path_GRList <- mapNamedRangesToSets(gene_GR,pathwayList)
 
diff --git a/man/checkMakeNetFuncSims.Rd b/man/checkMakeNetFuncSims.Rd
new file mode 100644
index 00000000..3c8a7dbf
--- /dev/null
+++ b/man/checkMakeNetFuncSims.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/similarities.R
+\name{checkMakeNetFuncSims}
+\alias{checkMakeNetFuncSims}
+\title{internal test function to check validity of makeNetFunc and sims}
+\usage{
+checkMakeNetFuncSims(makeNetFunc, sims, groupList)
+}
+\arguments{
+\item{makeNetFunc}{(function) makeNetFunc from buildPredictor()}
+
+\item{sims}{(list) sims from buildPredictor()}
+
+\item{groupList}{(list) groupList from buildPredictor()}
+}
+\value{
+(list) cleaned values for makeNetFunc and Sims
+}
+\description{
+internal test function to check validity of makeNetFunc and sims
+}
+\details{
+User must provide either makeNetFunc or sims. This function
+confirms this.
+}
diff --git a/man/checkSimValid.Rd b/man/checkSimValid.Rd
new file mode 100644
index 00000000..ca6626e2
--- /dev/null
+++ b/man/checkSimValid.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/similarities.R
+\name{checkSimValid}
+\alias{checkSimValid}
+\title{checks if provided similarity functions are valid. Returns error if not}
+\usage{
+checkSimValid(sims)
+}
+\arguments{
+\item{sims}{(list) keys are layer names, values are functions or characters (names of built-in similarity functions)}
+}
+\value{
+TRUE if all pass check. Else throws error.
+}
+\description{
+checks if provided similarity functions are valid. Returns error if not
+}
diff --git a/man/compileFeatures.Rd b/man/compileFeatures.Rd
index 4b79eb32..1443f2dc 100644
--- a/man/compileFeatures.Rd
+++ b/man/compileFeatures.Rd
@@ -6,7 +6,7 @@
 \usage{
 compileFeatures(
   netDir,
-  outDir = tempdir(),
+  outDir = normalizePath(tempdir()),
   simMetric = "pearson",
   netSfx = "txt$",
   verbose = TRUE,
@@ -90,14 +90,14 @@ makeNets <- function(dataList, groupList, netDir,...) {
 				writeProfiles=TRUE,...)
     unlist(netList)
 }
-tmpDir <- tempdir(); netDir <- paste(tmpDir,"nets",
+tmpDir <- normalizePath(tempdir()); netDir <- paste(tmpDir,"nets",
 sep=getFileSep())
 if (file.exists(netDir)) unlink(netDir,recursive=TRUE)
 dir.create(netDir,recursive=TRUE)
 
 pheno_id <- setupFeatureDB(pheno,netDir)
 netList <- createPSN_MultiData(dataList=dataList, groupList=groupList,
-    pheno=pheno_id,netDir=netDir,customFunc=makeNets,verbose=TRUE)
+    pheno=pheno_id,netDir=netDir,makeNetFunc=makeNets,verbose=TRUE)
 
 outDir <- paste(tmpDir,'dbdir',sep=getFileSep()); 
 dir.create(outDir)
diff --git a/man/convertToMAE.Rd b/man/convertToMAE.Rd
new file mode 100644
index 00000000..3b15beea
--- /dev/null
+++ b/man/convertToMAE.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/convertToMAE.R
+\name{convertToMAE}
+\alias{convertToMAE}
+\title{Wrapper that converts an input list into a MultiAssayExperiment object}
+\usage{
+convertToMAE(dataList)
+}
+\arguments{
+\item{dataList}{(list) input key-value pairs (keys: data types, values: 
+data in the form of matrices/dataframes); must have a key-value pair that
+corresponds to patient IDs/metadata labelled pheno.}
+}
+\value{
+MAE (MultiAssayExperiment) data from input list incorporated into a
+MultiAssayExperiment object, compatible with further analysis using the 
+netDx algorithm.
+}
+\description{
+Wrapper that converts an input list into a MultiAssayExperiment object
+}
+\details{
+This function takes in a list of key-value pairs (keys: data types,
+values: matrices/dataframes) and calls the necessary functions from the
+MultiAssayExperiment package to incorporate the values from the input list 
+into a MultiAssayExperiment object, transforming the values according to the 
+keys. If duplicate sample names are found in the assay data, only the first
+instance is kept.
+}
+\examples{
+data(xpr, pheno)
+
+# Generate random proteomic data
+prot <- matrix(rnorm(100*20), ncol=20)
+colnames(prot) <- sample(pheno$ID, 20)
+rownames(prot) <- sprintf("protein\%i",1:100)	
+
+myList <- list(rna = xpr, proteomic = prot, pheno = pheno)
+
+MAE <- convertToMAE(myList)
+}
diff --git a/man/makeInputForEnrichmentMap.Rd b/man/createInputForFeatureNetworkView.Rd
similarity index 69%
rename from man/makeInputForEnrichmentMap.Rd
rename to man/createInputForFeatureNetworkView.Rd
index 07f7ef89..a19dc433 100644
--- a/man/makeInputForEnrichmentMap.Rd
+++ b/man/createInputForFeatureNetworkView.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/helper.R
-\name{makeInputForEnrichmentMap}
-\alias{makeInputForEnrichmentMap}
+\name{createInputForFeatureNetworkView}
+\alias{createInputForFeatureNetworkView}
 \title{Wrapper to create input files for Enrichment Map}
 \usage{
-makeInputForEnrichmentMap(
+createInputForFeatureNetworkView(
   model,
   results,
   pathwayList,
@@ -31,12 +31,17 @@ makeInputForEnrichmentMap(
 \item{outDir}{(char) directory where files should be written}
 }
 \value{
-
+(list) 1) GMTfiles (char): GMT files used to create EnrichmentMap in Cytoscape.
+2) NodeStyles (char): .txt files used to assign node attributes in Cytoscape. Importantly, 
+attributes include node fill, which indicates the highest consistent score for a given 
+feature.
 }
 \description{
 Wrapper to create input files for Enrichment Map
 }
 \details{
+Creates the input to visualize selected features and their relationships
+as a network in Cytoscape. The type of visualization is called an Enrichment Map.
 An Enrichment Map is a network-based visualization of top-scoring pathway features
 and themes. It is generated in Cytoscape. This script generates the input files needed
 for Cytoscape to create an Enrichment Map visualization.
diff --git a/man/createNetFuncFromSimList.Rd b/man/createNetFuncFromSimList.Rd
new file mode 100644
index 00000000..82a5142f
--- /dev/null
+++ b/man/createNetFuncFromSimList.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/similarities.R
+\name{createNetFuncFromSimList}
+\alias{createNetFuncFromSimList}
+\title{Create PSN from provided similarities}
+\usage{
+createNetFuncFromSimList(
+  dataList,
+  groupList,
+  netDir,
+  sims,
+  verbose = TRUE,
+  ...
+)
+}
+\arguments{
+\item{dataList}{(list) patient data, output of dataList2List()}
+
+\item{groupList}{(list) measure groupings. Keys match assays(dataList) and are usually different data sources. Values for each are a list of 
+networks with user-provided groupings. See groupList in buildPredictor() for details.}
+
+\item{netDir}{(char) path to directory where networks are to be created}
+
+\item{sims}{(list) keys must be identical to those of groupList. Values are either of type character, used for built-in similarity functions, 
+or are functions, when a custom function is provided.}
+
+\item{verbose}{(logical) print messages}
+
+\item{...}{values to be passed to PSN creation functions such as makePSN_NamedMatrix().}
+}
+\description{
+Create PSN from provided similarities
+}
+\details{
+Called by createPSN_MultiData(), this is the function that converts user-provided
+similarity metrics to internal netDx function calls to generate nets.
+}
diff --git a/man/createPSN_MultiData.Rd b/man/createPSN_MultiData.Rd
index 3e6d45e5..200be81a 100644
--- a/man/createPSN_MultiData.Rd
+++ b/man/createPSN_MultiData.Rd
@@ -11,7 +11,8 @@ createPSN_MultiData(
   netDir = tempdir(),
   filterSet = NULL,
   verbose = TRUE,
-  customFunc,
+  makeNetFunc = NULL,
+  sims = NULL,
   ...
 )
 }
@@ -35,12 +36,17 @@ with internally-generated identifiers.}
 
 \item{verbose}{(logical) print messages}
 
-\item{customFunc}{(function) custom user-function to create PSN. 
+\item{makeNetFunc}{(function) custom user-function to create PSN. 
 Must take dataList,groupList,netDir as parameters. Must
 check if a given groupList is empty (no networks to create) before 
 the makePSN call for it. This is to avoid trying to make nets for datatypes
 that did not pass feature selection}
 
+\item{sims}{(list) Similarity metric settings for patient data. 
+Keys must be identical to those of groupList. 
+Values are either of type character, used for built-in similarity functions, 
+or are functions, when a custom function is provided.}
+
 \item{...}{other parameters to makePSN_NamedMatrix() or makePSN_RangedSets()}
 }
 \value{
@@ -124,5 +130,5 @@ netDir <- tempdir()
 pheno_id <- setupFeatureDB(colData(brca),netDir)
 createPSN_MultiData(dataList=datList2,groupList=groupList,
  pheno=pheno_id,
- netDir=netDir,customFunc=makeNets,numCores=1)
+ netDir=netDir,makeNetFunc=makeNets,numCores=1)
 }
diff --git a/man/fetchPathwayDefinitions.Rd b/man/fetchPathwayDefinitions.Rd
index 70a5ba93..8d4b4d11 100644
--- a/man/fetchPathwayDefinitions.Rd
+++ b/man/fetchPathwayDefinitions.Rd
@@ -11,7 +11,7 @@ fetchPathwayDefinitions(month = NULL, year = NULL, day = 1, verbose = FALSE)
 numeric or text (e.g. "January","April"). If NULL, fails.}
 
 \item{year}{(numeric) year of pathway definition file. Must be in
-yyyy format (e.g. 2018). If NULL, fails.}
+yyyy format (e.g. 2020). If NULL, fails.}
 
 \item{day}{(integer)}
 
@@ -33,7 +33,7 @@ For details see Merico D, Isserlin R, Stueker O, Emili A and GD Bader.
 (2010). PLoS One. 5(11):e13984.
 }
 \examples{
-fetchPathwayDefinitions("October",2020)
-fetchPathwayDefinitions("January",2018)
-fetchPathwayDefinitions(month=10,year=2020)
+fetchPathwayDefinitions("October",2021)
+fetchPathwayDefinitions("October",2021)
+fetchPathwayDefinitions(month=10,year=2021)
 }
diff --git a/man/getPSN.Rd b/man/getPSN.Rd
index b7c342e5..1e37f671 100644
--- a/man/getPSN.Rd
+++ b/man/getPSN.Rd
@@ -7,7 +7,8 @@
 getPSN(
   dat,
   groupList,
-  makeNets,
+  makeNetFunc = NULL,
+  sims = NULL,
   selectedFeatures,
   plotCytoscape = FALSE,
   aggFun = "MEAN",
@@ -22,9 +23,12 @@ getPSN(
 
 \item{groupList}{(list) feature groups, identical to groupList provided for buildPredictor()}
 
-\item{makeNets}{(function) Function used to create patient similarity networks. Identical to 
+\item{makeNetFunc}{(function) Function used to create patient similarity networks. Identical to 
 makeNets provided to buildPredictor()}
 
+\item{sims}{(list) rules for creating PSN. Preferred over makeNetFunc. See buildPredictor() 
+for details.}
+
 \item{selectedFeatures}{(list) selected features for each class (key of list). This object is returned as
 part of a call to getResults(), after running buildPredictor().}
 
diff --git a/man/getResults.Rd b/man/getResults.Rd
index 59808c90..ec0632b5 100644
--- a/man/getResults.Rd
+++ b/man/getResults.Rd
@@ -4,7 +4,13 @@
 \alias{getResults}
 \title{Compiles performance and selected features for a trained model.}
 \usage{
-getResults(res, status, featureSelCutoff = 1L, featureSelPct = 0)
+getResults(
+  res,
+  status,
+  featureSelCutoff = 1L,
+  featureSelPct = 0,
+  drawPerformancePlot = TRUE
+)
 }
 \arguments{
 \item{res}{(list) output of buildPredictor() function}
@@ -18,6 +24,8 @@ A feature must have minimum of this score for specified fraction of splits
 \item{featureSelPct}{(numeric between 0 and 1) cutoff percent for feature selection.
 A feature must have minimum score of featureSelCutoff for featureSelPct of 
 train/test splits to pass.}
+
+\item{drawPerformancePlot}{(logical) if TRUE, draws AUROC and AUPR plots. Set to FALSE to suppress graphical output.}
 }
 \value{
 list of results.
diff --git a/man/plotIntegratedPatientNetwork.Rd b/man/plotIntegratedPatientNetwork.Rd
index 3166a8b9..5ef12fc6 100644
--- a/man/plotIntegratedPatientNetwork.Rd
+++ b/man/plotIntegratedPatientNetwork.Rd
@@ -7,7 +7,8 @@
 plotIntegratedPatientNetwork(
   dataList,
   groupList,
-  makeNetFunc,
+  makeNetFunc = NULL,
+  sims = NULL,
   setName = "predictor",
   prune_pctX = 0.05,
   prune_useTop = TRUE,
@@ -33,6 +34,8 @@ clinical) and inner list to features to generate from that datatype.}
 
 \item{makeNetFunc}{(function) function to create features}
 
+\item{sims}{(list) rules for creating PSN. Preferred over makeNetFunc}
+
 \item{setName}{(char) name to assign the network in Cytoscape}
 
 \item{prune_pctX}{(numeric between 0 and 1) fraction of most/least 
diff --git a/man/plotPerf.Rd b/man/plotPerf.Rd
index 016341cd..6db2d1fd 100644
--- a/man/plotPerf.Rd
+++ b/man/plotPerf.Rd
@@ -4,7 +4,13 @@
 \alias{plotPerf}
 \title{Plots various measures of predictor performance for binary classifiers}
 \usage{
-plotPerf(resList = NULL, inFiles, predClasses, plotSEM = FALSE)
+plotPerf(
+  resList = NULL,
+  inFiles,
+  predClasses,
+  plotSEM = FALSE,
+  drawPlot = TRUE
+)
 }
 \arguments{
 \item{resList}{(list) list of prediction results. If provided, the method
@@ -17,6 +23,8 @@ A vector, each with absolute paths to predictionResults.txt}
 
 \item{plotSEM}{(logical) metric for error bars. If set to TRUE, plots SEM;
 else plots SD.}
+
+\item{drawPlot}{(logical) If TRUE, draws AUROC and AUPR curves.}
 }
 \value{
 (list) each key corresponds to an input file in inDir.
diff --git a/man/predict.Rd b/man/predict.Rd
index 57747a8e..78b8966a 100644
--- a/man/predict.Rd
+++ b/man/predict.Rd
@@ -8,10 +8,10 @@ predict(
   trainMAE,
   testMAE,
   groupList,
-  featSel,
-  makeNetFunc,
+  selectedFeatures,
+  makeNetFunc = NULL,
+  sims = NULL,
   outDir,
-  impute = FALSE,
   verbose = FALSE,
   numCores = 1L,
   JavaMemory = 4L,
@@ -25,17 +25,17 @@ predict(
 
 \item{groupList}{(list) list of features used to train the model. Keys are data types, and values are lists for groupings within those datatypes.
 e.g. keys could include {'clinical','rna','methylation'}, and values within 'rna' could include pathway names {'cell cycle', 'DNA repair'}, etc.,
-featSel will be used to subset}
+selectedFeatures will be used to subset}
 
-\item{featSel}{(list) selected features to be used in the predictive model. 
+\item{selectedFeatures}{(list) selected features to be used in the predictive model. 
 keys are patient labels (e.g. "responder/nonresponder"), and values are feature names 
 identified by running buildPredictor(). Feature names must correspond to names of groupList, from which they will be subset.}
 
 \item{makeNetFunc}{(function) function to create PSN features from patient data. See makeNetFunc in buildPredictor() for details}
 
-\item{outDir}{(char) directory for results}
+\item{sims}{(list) rules for creating PSN. Preferred over makeNetFunc.}
 
-\item{impute}{(logical) if TRUE imputes train and test samples separately before creating features. Currently unsupported.}
+\item{outDir}{(char) directory for results}
 
 \item{verbose}{(logical) print messages}
 
diff --git a/man/psn__builtIn.Rd b/man/psn__builtIn.Rd
new file mode 100644
index 00000000..edd3ab82
--- /dev/null
+++ b/man/psn__builtIn.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/similarities.R
+\name{psn__builtIn}
+\alias{psn__builtIn}
+\title{make PSN for built-in similarity functions}
+\usage{
+psn__builtIn(settings, verbose, ...)
+}
+\arguments{
+\item{settings}{(list) from makeNetFunc}
+
+\item{verbose}{(logical) print messages}
+
+\item{...}{parameters for makePSN_NamedMatrix()}
+}
+\value{
+(char) names of networks created. Side effect of network creation.
+}
+\description{
+make PSN for built-in similarity functions
+}
diff --git a/man/psn__corr.Rd b/man/psn__corr.Rd
new file mode 100644
index 00000000..c6d19968
--- /dev/null
+++ b/man/psn__corr.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/similarities.R
+\name{psn__corr}
+\alias{psn__corr}
+\title{wrapper for PSNs using Pearson correlation}
+\usage{
+psn__corr(settings, verbose, ...)
+}
+\arguments{
+\item{settings}{(list) from makeNetFunc}
+
+\item{verbose}{(logical) print messages}
+
+\item{...}{parameters for makePSN_NamedMatrix()}
+}
+\value{
+(char) names of networks created. Side effect of network creation.
+}
+\description{
+wrapper for PSNs using Pearson correlation
+}
diff --git a/man/psn__custom.Rd b/man/psn__custom.Rd
new file mode 100644
index 00000000..8fb9a0be
--- /dev/null
+++ b/man/psn__custom.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/similarities.R
+\name{psn__custom}
+\alias{psn__custom}
+\title{make PSN for custom similarity functions}
+\usage{
+psn__custom(settings, fn, verbose, ...)
+}
+\arguments{
+\item{settings}{(list) from makeNetFunc}
+
+\item{fn}{(function) custom similarity function}
+
+\item{verbose}{(logical) print messages}
+
+\item{...}{parameters for makePSN_NamedMatrix()}
+}
+\value{
+(char) names of networks created. Side effect of network creation.
+}
+\description{
+make PSN for custom similarity functions
+}
diff --git a/man/smoothMutations_LabelProp.Rd b/man/smoothMutations_LabelProp.Rd
deleted file mode 100644
index 51547487..00000000
--- a/man/smoothMutations_LabelProp.Rd
+++ /dev/null
@@ -1,75 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/smooMutationPropagation.R
-\name{smoothMutations_LabelProp}
-\alias{smoothMutations_LabelProp}
-\title{This function applies the random walk with restart propagation algorithm to a
-matrix of patients profiles}
-\usage{
-smoothMutations_LabelProp(mat, net, numCores = 1L)
-}
-\arguments{
-\item{mat}{(data.frame) Sparse matrix of binarized patient profiles, with
-rownames being unique patients and columns, unique genes. Entry [i,j] is
-    set to 1 if patient j has a mutation in gene i.}
-
-\item{net}{(data.frame) Interaction network provided as an adjacency
-matrix (i.e. symmetric)}
-
-\item{numCores}{(integer) Number of cores for parallel processing}
-}
-\value{
-(data.frame) Continuous matrix of patient profiles in which each gene
-  has the final propagation score
-}
-\description{
-This function applies the random walk with restart propagation algorithm to a
-matrix of patients profiles
-}
-\details{
-A network is an undirected graph G defined by a set of nodes
-  corresponding to genes, and edges connecting nodes with an experimental
-  evidence of interaction. A priori nodes are genes for which an information
-  is known. A novel node is a candidate for being associated to the nodes
-  above based on their information. A node prediction task leads to detect
-  novel nodes and propagation techniques are largely applied for the purpose.
-  Network-based propagation algorithms for node prediction transfer the
-  information from a priori nodes to any other node in a network. Each node
-  gets an imputation value which assesses how much information got. The
-  prediction is based on the guilty-by-association principle. A node with a
-  high imputation value has a high probability to be associated to a priori
-  nodes. E.g. in a house where room A has one heater, if room B is the second
-  hottest room it means that B is close to A and that there is a high
-  probability that they share a door or wall. These algorithms exploit the
-  global topology of the network. However, when they are applied to detect if
-  unknown nodes are functionally associated to known ones, they may suffer of
-  a drawback depending by the context. In biology, two functionally related
-  fragments interact physically (direct interaction) or interact indirectly
-  thanks to one or very few mediators. Therefore, exploring too far
-  similarities between nodes can introduce noise in the prediction. We apply
-  a random walk with restart propagation algorithm which resolution is set to
-  0.2 for giving high values only to the close neighbours of the a priori
-  nodes.
-}
-\examples{
-suppressWarnings(suppressMessages(require(MultiAssayExperiment)))
-require(doParallel)
-
-# load mutation and phenotype data
-genoFile <- system.file("extdata","TGCT_mutSmooth_geno.txt",package="netDx")
-geno <- read.delim(genoFile,sep="\t",header=TRUE,as.is=TRUE)
-phenoFile <- system.file("extdata", "TGCT_mutSmooth_pheno.txt",
-			package="netDx")
-pheno <- read.delim(phenoFile,sep="\t",header=TRUE,as.is=TRUE)
-rownames(pheno) <- pheno$ID
-
-# load interaction nets to smooth over
-require(BiocFileCache)
-netFileURL <- paste("https://download.baderlab.org/netDx/",
-	"supporting_data/CancerNets.txt",sep="")
-cache <- rappdirs::user_cache_dir(appname = "netDx")
-bfc <- BiocFileCache::BiocFileCache(cache,ask=FALSE)
-netFile <- bfcrpath(bfc,netFileURL)
-cancerNets <- read.delim(netFile,sep="\t",header=TRUE,as.is=TRUE)
-# smooth mutations
-prop_net <- smoothMutations_LabelProp(geno,cancerNets,numCores=1L)
-}
diff --git a/man/subsampleValidationData.Rd b/man/subsampleValidationData.Rd
new file mode 100644
index 00000000..588d3138
--- /dev/null
+++ b/man/subsampleValidationData.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/splitTestTrain_partition.R
+\name{subsampleValidationData}
+\alias{subsampleValidationData}
+\title{Subsample a hold-out set from a larger patient dataset}
+\usage{
+subsampleValidationData(dataMAE, pctValidation = 0.2, verbose = TRUE)
+}
+\arguments{
+\item{dataMAE}{(MultiAssayExperiment) patient data to be subsampled. Must have columns ID (patient ID)
+and STATUS}
+
+\item{pctValidation}{(numeric) Fraction of dataset to include in the validation set. Value from 0.05 to 0.95.}
+
+\item{verbose}{(logical)}
+}
+\value{
+(list) Keys are trainMAE and validationMAE. These contain corresponding MultiAssayExperiments for
+training and validation data
+}
+\description{
+Subsample a hold-out set from a larger patient dataset
+}
+\details{
+Creates a partition of data to be used for model validation after initial model building.
+In netDx, buildPredictor() is used for model training, and selected features from this exercise
+are used to validate a held-out dataset with the predict() function. Note that this function
+identifies a random subsample, which may result in a validation sample that is not representative
+of your training data. If this method is used, please use data exploration techniques (e.g. UMAP)
+to ensure that validation accuracy is not confounded by stratification.
+}
diff --git a/man/tSNEPlotter.Rd b/man/tSNEPlotter.Rd
index 9bc4a93c..a4cfaef6 100644
--- a/man/tSNEPlotter.Rd
+++ b/man/tSNEPlotter.Rd
@@ -12,7 +12,7 @@ matrix (symmetric). Row and column names are patient IDs. Note that NA
 values will be replaced by very small number (effectively zero).}
 
 \item{pheno}{(data.frame) Patient labels. ID column is patient ID and 
-STATUS is patient label of interest. tSNE will colour-code nodes by 
+STATUS is patient label of interest. tSNE will colour-code nodes by 
 patient label.}
 
 \item{...}{Parameters for Rtsne() function.}
diff --git a/man/thresholdSmoothedMutations.Rd b/man/thresholdSmoothedMutations.Rd
deleted file mode 100644
index c8ea9cff..00000000
--- a/man/thresholdSmoothedMutations.Rd
+++ /dev/null
@@ -1,74 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/smooMutationPropagation.R
-\name{thresholdSmoothedMutations}
-\alias{thresholdSmoothedMutations}
-\title{Apply discretization to the matrix resulted from the propagation on the
-sparse patient matrix}
-\usage{
-thresholdSmoothedMutations(
-  smoothedMutProfile,
-  unsmoothedMutProfile,
-  nameDataset,
-  n_topXmuts = c(10)
-)
-}
-\arguments{
-\item{smoothedMutProfile}{(data.frame) continous matrix of patient profiles 
-resulting from applying :.,$ s/network-based propagation algorithm 
-(smoothMutations_LabelProp()) on a binary somatic mutation sparse matrix.}
-
-\item{unsmoothedMutProfile}{(data.frame) binary somatic mutation sparse 
-matrix. Rownames are unique genes. Colnames are unique patients. A cell 
-contains a zero or a one.}
-
-\item{nameDataset}{(char) for titles on plot}
-
-\item{n_topXmuts}{(numeric between 0 and 1) percent of top mutations
-to keep. This function converts these to 1.0 when binarizing, so they
-remain in the thresholded output matrix; other mutations are set to zero.}
-}
-\value{
-(data.frame) binary somatic mutation matrix which sparsity has been 
-decreased
-}
-\description{
-Apply discretization to the matrix resulted from the propagation on the
-sparse patient matrix
-}
-\details{
-This function is included in the netDx use case which involves
-  propagating the sparse matrix of patient's profiles to reduce its sparsity.
-  This function applies discretization on the propagated matrix of patient
-  profiles. It sets to 1 the genes which got the highest propagation value.
-  While, the remaining genes are set to 0. This discretization is driven by
-  the fact that higher is the propagation value and higher is the chance that
-  the gene is involved in the patient condition and expression/mutation
-  profile. On the contrary, genes which got either a medium or a low value
-  are not trustable.
-}
-\examples{
-suppressWarnings(suppressMessages(require(MultiAssayExperiment)))
-require(doParallel)
-
-# load mutation and phenotype data
-genoFile <- system.file("extdata","TGCT_mutSmooth_geno.txt",package="netDx")
-geno <- read.delim(genoFile,sep="\t",header=TRUE,as.is=TRUE)
-phenoFile <- system.file("extdata", "TGCT_mutSmooth_pheno.txt",
-			package="netDx")
-pheno <- read.delim(phenoFile,sep="\t",header=TRUE,as.is=TRUE)
-rownames(pheno) <- pheno$ID
-
-# load interaction nets to smooth over
-require(BiocFileCache)
-netFileURL <- paste("https://download.baderlab.org/netDx/",
-	"supporting_data/CancerNets.txt",sep="")
-cache <- rappdirs::user_cache_dir(appname = "netDx")
-bfc <- BiocFileCache::BiocFileCache(cache,ask=FALSE)
-netFile <- bfcrpath(bfc,netFileURL)
-cancerNets <- read.delim(netFile,sep="\t",header=TRUE,as.is=TRUE)
-# smooth mutations
-prop_net <- smoothMutations_LabelProp(geno,cancerNets,numCores=1L)
-genoP <- thresholdSmoothedMutations(
-   prop_net,geno,"TGCT_CancerNets",c(20)
-  )
-}
diff --git a/man/plotEmap.Rd b/man/viewSelectedFeaturesAsNetworks.Rd
similarity index 92%
rename from man/plotEmap.Rd
rename to man/viewSelectedFeaturesAsNetworks.Rd
index 287ede23..67d4a5f5 100644
--- a/man/plotEmap.Rd
+++ b/man/viewSelectedFeaturesAsNetworks.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/plotEmap.R
-\name{plotEmap}
-\alias{plotEmap}
+\name{viewSelectedFeaturesAsNetworks}
+\alias{viewSelectedFeaturesAsNetworks}
 \title{Create EnrichmentMap in Cytoscape to visualize predictive pathways}
 \usage{
-plotEmap(
+viewSelectedFeaturesAsNetworks(
   gmtFile,
   nodeAttrFile,
   netName = "generic",
@@ -65,7 +65,7 @@ topology.}
 Cluster labels remain visible.}
 }
 \value{
-No value. Side effect of plotting the EnrichmentMap in an open 
+No value. Side effect of plotting the network view for features in an open 
 session of Cytoscape.
 }
 \description{
@@ -93,6 +93,6 @@ gmtFile <- EMap_input[[1]][1]
 nodeAttrFile <- EMap_input[[1]][2]
 
 # not run because requires Cytoscape to be installed and open
-# plotEmap(gmtFile = gmtFile, nodeAttrFile = nodeAttrFile, 
+# viewSelectedFeaturesAsNetworks(gmtFile = gmtFile, nodeAttrFile = nodeAttrFile, 
 #\t\tnetName='HighRisk')
 }
diff --git a/tests/testthat/pathway_ex3.gmt b/tests/testthat/pathway_ex3.gmt
new file mode 100644
index 00000000..9073b6e1
--- /dev/null
+++ b/tests/testthat/pathway_ex3.gmt
@@ -0,0 +1,3 @@
+STEARATE BIOSYNTHESIS I (ANIMALS)%HUMANCYC%PWY-5972	stearate biosynthesis I (animals)	ELOVL1	ACOT7	ACSL1	ACSL5	ELOVL6	ACSL4	ACSL3	ACOT2	ACOT1	ACSBG1	ACSBG2	SLC27A2	ACOT4	
+PUTRESCINE DEGRADATION III%HUMANCYC%PWY-0	putrescine degradation III	ALDH3A2	ALDH3B2	ALDH3A1	ALDH1B1	MAOB	ALDH2	MAOA	ALDH3B1	SAT2	SAT1	
+TRYPTOPHAN DEGRADATION III (EUKARYOTIC)%HUMANCYC%TRYPTOPHAN-DEGRADATION-1	tryptophan degradation III (eukaryotic)	ACAT1	HADHB	GCDH	TDO2	KYNU	HAAO	AFMID	KMO	ACAA1	ACAT2	ACMSD	
diff --git a/tests/testthat/prepare_data.R b/tests/testthat/prepare_data.R
new file mode 100755
index 00000000..761dc1b7
--- /dev/null
+++ b/tests/testthat/prepare_data.R
@@ -0,0 +1,48 @@
+# Prepare BRCA MultiAssayExperiment for netDx: add STAGE/ID/STATUS, drop excluded subtypes and duplicate samples
+prepareData <- function(dat, setBinary=FALSE) {
+### clean up stage variable: strip leading "t" and a/b/c/d suffix, then coerce to integer (NA if unparsable)
+staget <- sub("[abcd]","",sub("t","",colData(dat)$pathology_T_stage))
+staget <- suppressWarnings(as.integer(staget))
+colData(dat)$STAGE <- staget
+
+### remove NA PAM50 calls, remove normal samples (binary mode also drops Luminal B and NA-stage patients)
+tmp <- colData(dat)$PAM50.mRNA
+if (!setBinary){
+	idx <- which(tmp %in% c("Normal-like","HER2-enriched"))
+} else {
+	idx <- union(which(tmp %in% c("Normal-like","HER2-enriched","Luminal B")),
+			which(is.na(staget)))
+}
+idx <- union(idx, which(is.na(tmp)))
+pID <- colData(dat)$patientID
+tokeep <- setdiff(pID, pID[idx])
+dat <- dat[,tokeep,]
+pam50 <- colData(dat)$PAM50.mRNA
+
+### where a patient has multiple instances of the same assay
+### just keep the first instance encountered
+smp <- sampleMap(dat)
+expr <- assays(dat)
+for (k in seq_along(expr)) { # seq_along: loop body is skipped when there are no assays
+	samps <- smp[which(smp$assay==names(expr)[k]),]
+	notdup <- samps[which(!duplicated(samps$primary)),"colname"]
+	#message(sprintf("%s: %i notdup", names(expr)[k], length(notdup)))
+	dat[[k]] <- suppressMessages(dat[[k]][,notdup])
+}
+
+### create ID, STATUS columns, remove spaces/hyphens from patient labels
+pID <- colData(dat)$patientID
+colData(dat)$ID <- pID
+colData(dat)$STATUS <- pam50
+colData(dat)$STATUS <- gsub(" ",".",colData(dat)$STATUS)
+colData(dat)$STATUS <- gsub("-",".",colData(dat)$STATUS)
+
+if (setBinary){ # collapse every label other than Luminal.A into "other"
+	st <- colData(dat)$STATUS
+	st[which(!st %in% "Luminal.A")] <- "other"
+	colData(dat)$STATUS <- st
+}
+
+return(dat)
+}
+
diff --git a/tests/testthat/test_buildpredictor.R b/tests/testthat/test_buildpredictor.R
index 1678d8a2..d0c19c65 100644
--- a/tests/testthat/test_buildpredictor.R
+++ b/tests/testthat/test_buildpredictor.R
@@ -26,7 +26,7 @@ test_that("split test/train works", {
 
 })
 
-test_that("feature construction and compilation",{
+test_that("feature construction and compilation works",{
 	# 20 patients, 10 case, 10 control
 	pheno <- data.frame(ID=sprintf("PAT%i",1:20),
 		STATUS=rep(c("case","control"),each=10))
@@ -69,12 +69,13 @@ test_that("feature construction and compilation",{
 	# directory contains GENES.TXT, NETWORKS.TXT INTERACTIONS folder
 	 outDir <- tempdir()
 	netDir <- sprintf("%s/tmp",outDir)
+	if (file.exists(netDir)) unlink(netDir,recursive=TRUE)
 	dir.create(netDir)
 
 	pheno_id <- setupFeatureDB(pheno,netDir)
 	x <-createPSN_MultiData(dataList=dataList,groupList=groupList,
 			pheno=pheno_id,
-			netDir=netDir,customFunc=makeNets,numCores=1,
+			netDir=netDir,makeNetFunc=makeNets,numCores=1,
 			verbose=FALSE)
 	
 	# number of nets equals those submitted for creation
@@ -102,35 +103,53 @@ test_that("feature construction and compilation",{
 	expect_match(dir(dbDir$dbDir,"lucene.index"),"lucene.index") # copied ok
 })
 
-###test_that("making PSN works: Pearson", {
-###	# create network
-###  # read in file
-###  # take two patients, compute pearson, check with outptu in file
-###	# num .profile files are greater than zero
-###})
-
-###test_that("making PSN works: custom func", {
-###	# same
-###	# num _cont.txt files are greater than zero
-###})
-###
-#### ------------------------------------------
-#### feature selection
-#### ------------------------------------------
-###
-###test_that("queries are made correctly", {
-###	# patients are subset of incPat
-###})
-###
-###test_that("feature selection gives expected output", {
-###	# one dir per class
-###	# N PRANK and NRANK files
-###	# correct number of _pathwayCV_score.txt files
-###})
-###
-#### ------------------------------------------
-#### predicting test label
-#### ------------------------------------------
-###test_that("test patients are labelled correctly", {
-###	# are all test patients labelled?
-###})
+test_that("holding out and validation set prediction works as it should", {
+	# Builds an MAE from the packaged xpr/pheno data, holds out 20% for validation, runs predict() and checks its output files
+	data(xpr,pheno)
+	pheno$AGE <- runif(nrow(pheno),min=10,max=45)
+	pathList <- list(
+		a=sample(rownames(xpr),25,FALSE),
+		b=sample(rownames(xpr),100,FALSE),
+		c=sample(rownames(xpr),30,FALSE)
+	)
+	suppressMessages(require(MultiAssayExperiment))
+	objlist <- list(a=SummarizedExperiment(xpr))
+	mae <- MultiAssayExperiment(objlist,pheno)
+	
+	# test validation set features: partition is complete, classes are balanced, and ~20% is held out
+	dset <- subsampleValidationData(mae,pctValidation=0.2)
+	testthat::expect_equal(nrow(colData(dset[[1]])) + nrow(colData(dset[[2]])),
+				nrow(colData(mae)) )
+	tst <- colData(dset[[2]])$STATUS
+	lbl <- unique(tst)
+	testthat::expect_equal(sum(tst == lbl[1]), sum(tst==lbl[2]))
+	testthat::expect_equal(0.2, round(length(tst)/nrow(colData(mae)),digits=1) )
+	# use a dedicated subdirectory for output; unlinking tempdir() itself would wipe the whole R session temp dir
+	outDir <- file.path(tempdir(),"holdout_pred")
+	if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
+	dir.create(outDir)
+	featSel <- list(LumA=c("a.profile","age_cont.txt"),notLumA=c("a.profile","b.profile"))
+	predModel <- suppressMessages(
+		predict(trainMAE=dset[[1]], testMAE=dset[[2]],
+			groupList=list(a=pathList,clinical=list(age="AGE")),
+			selectedFeatures=featSel,
+			sims=list(a="pearsonCorr",clinical="normDiff"),
+			outDir=outDir, verbose = FALSE)
+	)
+
+	for (g in c("LumA","notLumA")){
+		luma <- read.delim(sprintf("%s/%s/%s_query",outDir,g,g),sep=":")
+		pat <- unlist(strsplit(trimws(luma[1,1]),"\t"))
+		# check that query patients are training samples for the corresponding class
+		testthat::expect_equal(sort(pat),sort(colData(dset[[1]])$ID[which(colData(dset[[1]])$STATUS==g)]))
+		# check that only selected features are used
+		nets <- read.delim(sprintf("%s/%s/networks/NETWORKS.txt",outDir,g),sep="\t",header=FALSE)
+		testthat::expect_identical(featSel[[g]],nets[,2])
+		# test that only and all holdout set patients are being classified
+		dat <- read.delim(sprintf("%s/%s/%s_query-results.report.txt.PRANK",outDir,g,g),header=TRUE,skip=1,sep="\t")
+		sb <- dat[which(!is.na(dat[,2])),]
+		testthat::expect_equal(nrow(colData(dset[[2]])), length(intersect(sb[,1],colData(dset[[2]])$ID)))
+	}
+
+})
+
diff --git a/tests/testthat/test_convertToMAE.R b/tests/testthat/test_convertToMAE.R
new file mode 100644
index 00000000..2633a8bd
--- /dev/null
+++ b/tests/testthat/test_convertToMAE.R
@@ -0,0 +1,75 @@
+# test convertToMAE.R
+
+test_that("convertToMAE works", {
+    # 20 patients, 10 case, 10 control
+	pheno <- data.frame(ID=sprintf("PAT%i",1:20),
+		STATUS=rep(c("case","control"),each=10))
+	# 100 dummy genes
+	rna <- matrix(rnorm(100*20),nrow=100); 
+	colnames(rna) <- pheno$ID
+	rownames(rna) <- sprintf("gene%i",1:100)	
+	# 1 dummy clin variable (AGE)
+	clin <- t(data.frame(AGE=runif(20,10,50)))
+	colnames(clin) <- pheno$ID
+    clin <- t(clin)
+
+	# netDx files
+	dataList <- list(rna=rna,pheno=clin)
+    
+    x <- convertToMAE(dataList)
+    expect_is(x, "MultiAssayExperiment")
+})
+
+test_that("convertToMAE works with more than one assay", {
+	# 20 patients, 10 case, 10 control
+	pheno <- data.frame(ID=sprintf("PAT%i",1:20),
+		STATUS=rep(c("case","control"),each=10))
+	# 100 dummy genes
+	rna <- matrix(rnorm(100*20),nrow=100); 
+	colnames(rna) <- pheno$ID
+	rownames(rna) <- sprintf("gene%i",1:100)
+	# 200 dummy proteins
+	prot <- matrix(rnorm(200*20), nrow = 200);
+	colnames(prot) <- pheno$ID
+	rownames(prot) <- sprintf("protein%i",1:200) 	
+	# 1 dummy clin variable (AGE)
+	clin <- t(data.frame(AGE=runif(20,10,50)))
+	colnames(clin) <- pheno$ID
+    clin <- t(clin)
+
+	# netDx files
+	dataList <- list(rna = rna, proteomics = prot, pheno = clin)
+
+	x <- convertToMAE(dataList)
+	expect_is(x, "MultiAssayExperiment")
+})
+
+test_that ("convertToMAE removes duplicated sample", {
+	# 20 patients, 10 case, 10 control
+	pheno <- data.frame(ID=sprintf("PAT%i",1:20),
+                    STATUS=rep(c("case","control"),each=10))
+	# 100 dummy genes, with first sample duplicated
+	rna <- matrix(rnorm(100*20),nrow=100); 
+	colnames(rna) <- pheno$ID
+	rownames(rna) <- sprintf("gene%i",1:100)
+	rna <- cbind(rna, rna[,1])
+	colnames(rna)[21] <- colnames(rna)[1]
+	# 200 dummy proteins
+	prot <- matrix(rnorm(200*20), nrow = 200);
+	colnames(prot) <- pheno$ID
+	rownames(prot) <- sprintf("protein%i",1:200) 	
+	# 1 dummy clin variable (AGE)
+	clin <- t(data.frame(AGE=runif(20,10,50)))
+	colnames(clin) <- pheno$ID
+	clin <- t(clin)
+	
+	# netDx files
+	dataList <- list(rna = rna, proteomics = prot, pheno = clin)
+
+	x <- convertToMAE(dataList)
+	expect_is(x, "MultiAssayExperiment")
+	# number of samples in rna assay vs colData should differ by 1
+	expect_equal((dim(rna)[2] - dim(colData(x))[1]), 1)
+	# number of samples in metadata should agree with colData
+	expect_equal((dim(clin)[1] - dim(colData(x))[1]), 0)
+})
\ No newline at end of file
diff --git a/tests/testthat/test_suite.R b/tests/testthat/test_misc.R
similarity index 99%
rename from tests/testthat/test_suite.R
rename to tests/testthat/test_misc.R
index 8719dac4..7ead8a77 100644
--- a/tests/testthat/test_suite.R
+++ b/tests/testthat/test_misc.R
@@ -10,7 +10,6 @@ test_that("readPathways works", {
 		expect_lt(max(ln),201)
 })
 
-
 ###test_that("lasso filtering works", {
 ###	# make own subroutine
 ###})
diff --git a/tests/testthat/test_psn_construction.R b/tests/testthat/test_psn_construction.R
new file mode 100644
index 00000000..9778e819
--- /dev/null
+++ b/tests/testthat/test_psn_construction.R
@@ -0,0 +1,126 @@
+
+
+###test_that("invalid sims is flagged",{
+###
+###    expect_equal(TRUE, checkSimValid(list(a="pearsonCorr")))
+###    expect_identical(TRUE, checkSimValid(list(a="pearsonCorr",b=function(x) 2+4)))
+###    expect_identical(TRUE, checkSimValid(list(a="normDiff")))
+###    expect_error(checkSimValid(list(a="normDifff")))
+###    expect_error(checkSimValid(list(a=2)))
+###})
+
+#test_that("psns constructed using provided sims",{
+    rm(list=ls())
+    library(curatedTCGAData)
+    library(netDx)
+    brca <- suppressMessages(
+    curatedTCGAData("BRCA",
+               c("mRNAArray","RPPA*","Methylation_methyl27*"),
+	dry.run=FALSE,version="1.1.38"))
+    
+    source("prepare_data.R")
+    brca <- prepareData(brca,setBinary=TRUE)
+
+    groupList <- list()
+
+    # genes in mRNA data are grouped by pathways
+    pathFile <- "pathway_ex3.gmt"
+    pathList <- readPathways(pathFile)
+    groupList[["BRCA_mRNAArray-20160128"]] <- pathList
+    # clinical data is not grouped; each variable is its own feature
+    groupList[["clinical"]] <- list(
+          age="patient.age_at_initial_pathologic_diagnosis",
+    	   stage="STAGE"
+    )
+    # for methylation generate one feature containing all probes
+    # same for proteomics data
+    for (k in 2:3){
+    tmp <- list(rownames(experiments(brca)[[k]]));
+    names(tmp) <- names(brca)[k]
+    groupList[[names(brca)[k]]] <- tmp
+    }
+
+dataList <- dataList2List(brca,groupList)
+pheno <- data.frame(INTERNAL_ID=1:nrow(colData(brca)),
+    ID=colData(brca)$ID)
+
+netDir <- paste(tempdir(),"nets",sep=getFileSep())
+# ----------
+message("test pearson and built-in function")
+if (file.exists(netDir)) unlink(netDir,recursive=TRUE)
+dir.create(netDir,recursive=TRUE)
+sims <- list(a="pearsonCorr",b="normDiff",
+        c="pearsonCorr",d="pearsonCorr")
+names(sims) <- names(groupList)
+x <- createNetFuncFromSimList(dataList$assays,groupList,
+    netDir,sims)
+prof <- dir(netDir,".profile$")
+cont <- dir(netDir,"_cont.txt")
+testthat::expect_equal(7,length(x))
+pathways <- c(names(groupList[[1]]),names(groupList[3:4]))
+testthat::expect_identical(sort(pathways), sort(sub(".profile$","",prof)))
+print(all.equal(sort(names(groupList[[2]])),sort(sub("_cont.txt","",cont))))
+
+# ----------
+message("just pearson")
+gp1 <- groupList[1]; d1 <- dataList$assays[1]
+sims <- list(a="pearsonCorr"); names(sims) <- names(gp1)
+if (file.exists(netDir)) unlink(netDir,recursive=TRUE)
+dir.create(netDir,recursive=TRUE)
+x <- createNetFuncFromSimList(d1,gp1,
+    netDir,sims)
+prof <- dir(netDir,".profile$")
+cont <- dir(netDir,"_cont.txt")
+testthat::expect_equal(length(prof),3)
+testthat::expect_equal(length(cont),0)
+
+message("just built-in")
+gp1 <- groupList[2]; d1 <- dataList$assays["clinical"]
+sims <- list(a="normDiff"); names(sims) <- names(gp1)
+if (file.exists(netDir)) unlink(netDir,recursive=TRUE)
+dir.create(netDir,recursive=TRUE)
+x <- createNetFuncFromSimList(d1,gp1,
+    netDir,sims)
+prof <- dir(netDir,".profile$")
+cont <- dir(netDir,"_cont.txt")
+testthat::expect_equal(length(prof),0)
+testthat::expect_equal(length(cont),2)
+
+message("just custom")
+gp1 <- groupList[2]; d1 <- dataList$assays["clinical"]
+sims <- list(a=normDiff); names(sims) <- names(gp1)
+if (file.exists(netDir)) unlink(netDir,recursive=TRUE)
+dir.create(netDir,recursive=TRUE)
+x <- createNetFuncFromSimList(d1,gp1,
+    netDir,sims)
+prof <- dir(netDir,".profile$")
+cont <- dir(netDir,"_cont.txt")
+testthat::expect_equal(length(prof),0)
+testthat::expect_equal(length(cont),2)
+
+message("test full net creation function")
+sims <- list(a="pearsonCorr",b="normDiff",
+        c="pearsonCorr",d="pearsonCorr")
+names(sims) <- names(groupList)
+if (file.exists(netDir)) unlink(netDir,recursive=TRUE)
+dir.create(netDir,recursive=TRUE)
+netList<- createPSN_MultiData(
+    dataList=dataList$assays,
+    groupList=groupList,
+    pheno=pheno, netDir=netDir,
+    makeNetFunc=NULL, sims=sims,
+    numCores=1,verbose=TRUE)
+expect_equal(7, length(netList))
+testthat::expect_equal(2, length(dir(sprintf("%s/INTERACTIONS",netDir))))
+
+
+# test full predictor
+#set.seed(42) # make results reproducible
+##out <- 
+##   buildPredictor(dataList=brca,groupList=groupList,
+##      sims=sims,
+##      outDir=outDir, ## netDx requires absolute path
+##      numSplits=2L, featScoreMax=2L, featSelCutoff=1L,
+##	  numCores=nco)
+##    
+#})
\ No newline at end of file
diff --git a/vignettes/BuildPredictor.Rmd b/vignettes/BuildPredictor.Rmd
deleted file mode 100755
index 7d0fa61c..00000000
--- a/vignettes/BuildPredictor.Rmd
+++ /dev/null
@@ -1,329 +0,0 @@
----
-title: "Building binary classifier from clinical and 'omic data using pathway-level features"
-author: "Shraddha Pai"
-package: netDx
-date: "`r Sys.Date()`"
-output: 
-  BiocStyle::html_document:
-    toc_float: true
-vignette: >
-    %\VignetteIndexEntry{01. Build binary predictor and view performance, top features and integrated Patient Similarity Network}.
-    %\VignetteEngine{knitr::rmarkdown}
-    %\VignetteEncoding{UTF-8}
----
-
-# Introduction
-
-In this example we will build a predictor to classify breast tumours as being either of Luminal A subtype or otherwise. The process is identical for classifying three or more labels, and the example uses minimal data for quick runtime.
-
-For this we will use data from the  The Cancer Genome Atlas, and will integrate three types of -omic data: 
-
-* gene expression from Agilent mRNA microarrays and
-* miRNA sequencing 
-
-
-```{r, fig.cap="We will integrate two layers of genomic data. Each layer will be converted into a single patient similarity network using Pearson correlation for pairwise similarity. ",echo=FALSE, crop=NULL}
-knitr::opts_chunk$set(crop=NULL)
-knitr::include_graphics("images/vignette1_design.jpg")
-```
-
-## Get and prepare data
-
-In this example, we use curated multi-modal data from The Cancer Genome Atlas, gotten from the BioConductor `curatedTCGAData` package. Data for all cancer types profiled in TCGA are available through this package; [see this tutorial for details](https://bioconductor.org/packages/release/data/experiment/vignettes/curatedTCGAData/inst/doc/curatedTCGAData.html). 
-
-```{r, eval=TRUE}
-suppressMessages(library(curatedTCGAData))
-```
-
-# Data 
-
-In this example, we use curated data from The Cancer Genome Atlas, through the BioConductor `curatedTCGAData` package. The goal is to classify a breast tumour into either a Luminal A subtype or otherwise. The predictor will integrate clinical variables selected by the user, along with gene expression data.
-
-Here we load the required packages and download clinical and gene expression data.
-```{r,eval=TRUE}
-suppressMessages(library(curatedTCGAData))
-```
-
-Let's take a look at the available data for breast cancer, without downloading any (set `dry.run=TRUE`). 
-Note that the new release of BioConductor (3.13) actually allows users to fetch one of two versions of TCGA data. 
-
-```{r, eval=TRUE}
-curatedTCGAData(diseaseCode="BRCA", assays="*",dry.run=TRUE, version="1.1.38")
-```
-Now let's actually download the data, getting just the three layers we need:
-
-```{r, eval=TRUE}
-brca <- suppressMessages(curatedTCGAData("BRCA",
-                                         c("mRNAArray", 
-										 "miRNASeqGene"),
-                                         dry.run=FALSE, version="1.1.38"))
-```
-
-This call returns a `MultiAssayExperiment` object. Recall that this is a container for storing multiple assays performed on the same set of samples. [See this tutorial](https://bioconductor.org/packages/release/bioc/vignettes/MultiAssayExperiment/inst/doc/QuickStartMultiAssay.html) to learn more.
-
-Let's briefly explore the `brca` `MultiAssayExperiment` object.
-
-```{r, eval=TRUE}
-brca
-```
-
-`assays()` returns a `list` with all -omic data associated with this object.  
-
-```{r, eval=TRUE}
-summary(assays(brca))
-```
-`names()` shows the datatypes in each slot of `assays()`:
-
-```{r, eval=TRUE}
-names(assays(brca))
-```
-
-So miRNA data is in slot 1, gene expression in slot 2, etc., 
-
-We can subset the data to see what it looks like. Let's do that for the miRNA data, looking at just the first five measures:
-
-```{r, eval=TRUE}
-mir <- assays(brca)[["BRCA_miRNASeqGene-20160128"]]
-head(mir[,1:5])
-```
-
-Patient metadata is contained in the `colData()` slot. Rows contain data for each patient and columns contain measures such as clinical characteristics:
-
-```{r, eval=TRUE}
-pheno <- colData(brca)
-colnames(pheno)[1:20]
-head(pheno[,1:5])
-```
-
-This next code block prepares the TCGA data. This includes:
-
-* removing duplicate samples
-* reformatting patient IDs (e.g. removing spaces and hyphens)
-* creating an `ID` column in `colData(brca)`, which contains unique patient IDs
-* creating a `STATUS` column in `colData(brca)` which contains the patient labels (i.e what we want netDx to classify). 
-
-
-In practice you would prepare the dataset once and save it to file, then separately load it before running netDx; i.e. decouple data processing and running the predictor. The data processing code has been moved into a supporting file, `prepare_data.R`. You can explore it after the lab to see how some things are achieved (e.g. removing duplicate samples). For now, let's just run it.
-
-```{r, eval=TRUE}
-source("prepare_data.R")
-brca <- prepareData(brca,setBinary=TRUE)
-```
-
-The important thing is to create `ID` and `STATUS` columns in the sample metadata table. netDx uses these to get the patient identifiers and labels, respectively.
-
-
-```{r, eval=TRUE}
-pheno <- colData(brca)
-head(pheno[,c("ID","STATUS")])
-table(pheno$STATUS,useNA="always")  # good practice: useNA="always" shows missing values
-```
-
-## Create feature design rules
-
-Now let's set up the data for input to netDx. 
-
-netDx allows the user to define how data is converted into patient similarity networks (or PSNs), which are the features that go into the model. This is done specifically by telling the model how to:
-
-* **group** different types of data and 
-* **define similarity** for each of these (e.g. Pearson correlation, normalized difference, etc.,).
-
-The relevant input parameters are:
-
-* `groupList`: sets of input data that would correspond to individual networks (e.g. genes grouped into pathways)
-* `makeNets()`: an R function telling netDx what similarity metric to use for each data layer
-
-Let's start by loading the `netDx` package.
-
-
-```{r, eval=TRUE}
-suppressWarnings(suppressMessages(require(netDx)))
-```
-
-Let's set up each of the input arguments one by one.
-
-### groupList
-**What is this:** `groupList` tells netDx how to group measures within a layer, to generate a PSN. Measures could be individual genes, proteins, CpG bases (in DNA methylation data), clinical variables, etc., 
-
-In this simple example we just create a single PSN for each datatype, containing all measures from that datatype.
-
-
-```{r, eval=TRUE}
-expr <- assays(brca)
-groupList <- list()
-for (k in 1:length(expr)) {	# loop over all layers
-	cur <- expr[[k]]; nm <- names(expr)[k]
-
-	# all measures from this layer go into our single PSN
-	groupList[[nm]] <- list(nm=rownames(cur)) 
-
-	# assign same layer name as in input data
-	names(groupList[[nm]])[1] <- nm;
-}
-```
-Notice that `groupList` is a two tiered list, or list-of-lists. The first tier is for each data layers, with names matching those in `assays(brca)`. The second tier contains all the PSNs we want to make for that layer. In this lab exercise we create only one PSN per data layer, using all the measures from an -omic assay. e.g. One PSN based on similarity across entire transcriptome, one for methylome, etc.,). So in this lab exercise, the inner tier simply contains one entry, with all measures for the given layer. 
-
-This design will get more interesting in Lab 2, when we create pathway-level features. 
-
-Let's take a look at `groupList`. Here is the first tier:
-
-
-```{r, eval=TRUE}
-summary(groupList)
-```
-
-And the second tier:
-
-```{r, eval=TRUE}
-names(groupList[["BRCA_mRNAArray-20160128"]])
-length(groupList[["BRCA_mRNAArray-20160128"]][[1]])
-head(groupList[["BRCA_mRNAArray-20160128"]][[1]])
-```
-
-### Define patient similarity for each network
-The `makeNets` function tells the predictor how to create networks from provided input data.
-
-This function requires `dataList`,`groupList`, and `netDir` as input variables. The residual `...` parameter is to pass additional variables to `makePSN_NamedMatrix()`, notably `numCores` (number of parallel jobs).
-
-netDx requires that this function have:
-
-* `dataList`,`groupList`, and `netDir` as input variables. The residual `...` parameter is to pass additional variables to `makePSN_NamedMatrix()`, notably number of cores for parallel processing (`numCores`). 
-
-
-```{r,  eval=TRUE}
-makeNets <- function(dataList, groupList, netDir,...) {
-	netList <- c() # initialize before is.null() check
-	
-	layerNames <- c("BRCA_miRNASeqGene-20160128",
-		"BRCA_mRNAArray-20160128")
-	
-	for (nm in layerNames){  			## for each layer
-		if (!is.null(groupList[[nm]])){ ## must check for null for each layer
-			netList_cur <- makePSN_NamedMatrix(
-				dataList[[nm]],
-				rownames(dataList[[nm]]),	## names of measures (e.g. genes, CpGs)
-				groupList[[nm]],			## how to group measures in that layer
-				netDir,						## leave this as-is, netDx will figure out where this is.
-				verbose=FALSE, 			
-				writeProfiles=TRUE,   		## use Pearson correlation-based similarity
-				...
-				)
-
-			netList <- c(netList,netList_cur)	## just leave this in
-		}
-	}
-	return(unlist(netList))	## just leave this in 
-}
-```
-
-## Build predictor
-
-Finally we call the function that runs the netDx predictor. We provide:
-
-* patient data  (`dataList`)
-* grouping rules (`groupList`)
-* function to create PSN from data, includes choice of similarity metric (`makeNetFunc`)
-* number of train/test splits over which to collect feature scores and average performance (`numSplits`), 
-* maximum score for features in one round of feature selection  (`featScoreMax`, set to 10)
-* threshold to call feature-selected networks for each train/test split (`featSelCutoff`); only features scoring this value or higher will be used to classify test patients,
-* number of cores to use for parallel processing (`numCores`).
-
-The call below runs 10 train/test splits. 
-Within each split, it:
-
-* splits data into train/test using the default split of 80:20 (`trainProp=0.8`)
-* score networks between 0 to 10 (i.e. `featScoreMax=10L`)
-* uses networks that score >=9 out of 10 (`featSelCutoff=9L`) to classify test samples for that split.
-
-In practice a good starting point is `featScoreMax=10`, `featSelCutoff=9` and `numSplits=10L`, but these parameters depend on the sample sizes in the dataset and heterogeneity of the samples. 
-
-This step can take a few hours based on the current parameters, so we're commenting this out for the tutorial and will simply load the results.
- 
-```{r lab1-buildpredictor ,eval=TRUE}
-nco <- round(parallel::detectCores()*0.75) # use 75% available cores
-message(sprintf("Using %i of %i cores", nco, parallel::detectCores()))
-t0 <- Sys.time()
-set.seed(42) # make results reproducible
-outDir <- paste(tempdir(),randAlphanumString(),
-	"pred_output",sep=getFileSep())
-if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
-model <- suppressMessages(buildPredictor(
-	dataList=brca,			## your data
-	groupList=groupList,	## grouping strategy
-	makeNetFunc=makeNets,	## function to build PSNs
-	outDir=outDir, 			## output directory
-	trainProp=0.8,			## pct of samples to use to train model in
-							## each split
-	numSplits=2L,			## number of train/test splits
- featSelCutoff=1L,		## threshold for calling something
-							## feature-selected
-	featScoreMax=2L,		## max score for feature selection
- numCores=nco,			## set higher for parallelizing
- debugMode=FALSE,
- keepAllData=FALSE,	## set to TRUE for debugging or low-level files used by the dictor
- logging="none"
-  ))
-t1 <- Sys.time()
-print(t1-t0)
-```
-
-## Examine results
-
-Now we get model output, including performance for various train/test splits and consistently high-scoring features. 
-
-
-In the function below, we define top-scoring features as those which score two out of two in at least half of the train/test splits:
-
-```{r lab1-getresults,eval=TRUE}
-results <- getResults(model,unique(colData(brca)$STATUS),
-	featureSelCutoff=2L,featureSelPct=0.50)
-```
-
-`results` contains `performance`, `selectedFeatures` for each patient label, and the table of feature `scores`.
-
-```{r, eval=TRUE}
-summary(results)
-```
-
-Look at the performance:
-```{r, eval=TRUE}
-results$performance
-```
-
-Look at feature scores for all labels, across all train-test splits:
-
-```{r,  eval=TRUE}
-results$featureScores
-```
-
-Let's examine our confusion matrix:
-```{r, eval=TRUE}
-confMat <- confusionMatrix(model)
-```
-
-*Note: Rows of this matrix don't add up to 100% because the matrix is an average of the confusion matrices from all of the train/test splits.*
-
-And here are selected features, which are those scoring 2 out of 2 in at least half of the splits. This threshold is simply for illustration. In practice we would run at least 10 train/test splits (ideally 100+), and look for features that score 7+ out of 10 in >70% splits.
-
-```{r, eval=TRUE}
-results$selectedFeatures
-```
-
-We finally get the integrated PSN and visualize it using a tSNE plot:
-
-```{r, fig.width=8,fig.height=8, eval=TRUE}
-## this call doesn't work in Rstudio; for now we've commented this out and saved the PSN file. 
-psn <- getPSN(brca,groupList,makeNets,results$selectedFeatures)
-
-require(Rtsne)
-tsne <- tSNEPlotter(
-	psn$patientSimNetwork_unpruned, 
-	colData(brca)
-	)
-```
-
-
-## sessionInfo
-```{r}
-sessionInfo()
-```
diff --git a/vignettes/Predict_CaseControl_from_CNV.Rmd.old b/vignettes/Predict_CaseControl_from_CNV.Rmd.old
index 2ad7acb9..c48d0cf0 100755
--- a/vignettes/Predict_CaseControl_from_CNV.Rmd.old
+++ b/vignettes/Predict_CaseControl_from_CNV.Rmd.old
@@ -32,7 +32,7 @@ cnv_GR    <- GRanges(pheno$seqnames,
 pheno <- pheno[!duplicated(pheno$ID),]
 
 pathFile <- fetchPathwayDefinitions(
-	"February",2018,verbose=TRUE)
+	"February",2020,verbose=TRUE)
 pathwayList <- readPathways(pathFile)
 
 # get gene coordinates, use hg18
@@ -163,7 +163,7 @@ for the hg18 genome build are automatically fetched from a remote location, and
 
 
 ```{r,eval=TRUE}
-pathFile <- fetchPathwayDefinitions("February",2018,verbose=TRUE)
+pathFile <- fetchPathwayDefinitions("February",2020,verbose=TRUE)
 pathwayList <- readPathways(pathFile)
 
 # get gene coordinates, use hg18
diff --git a/vignettes/RawDataConversion.Rmd b/vignettes/RawDataConversion.Rmd
new file mode 100644
index 00000000..e67a2532
--- /dev/null
+++ b/vignettes/RawDataConversion.Rmd
@@ -0,0 +1,265 @@
+---
+title: "03. Creating MultiAssayExperiment input object from tables of patient data"
+author: "Indy Ng and Shraddha Pai"
+package: netDx
+date: "`r Sys.Date()`"
+output: 
+  BiocStyle::html_document:
+    toc_float: true
+vignette: >
+    %\VignetteIndexEntry{03. Creating MultiAssayExperiment input object from tables of patient data}
+    %\VignetteEngine{knitr::rmarkdown}
+    %\VignetteEncoding{UTF-8}
+---
+
+# Introduction
+
+In this example we will build a predictor to classify breast tumours as being either of Luminal A subtype or otherwise. The process is identical for classifying three or more labels, and the example uses minimal data for quick runtime.
+
+Although the netDx algorithm requires assay data to be provided in the form of a `MultiAssayExperiment` object, the package comes equipped with the `convertToMAE()` wrapper function to transform raw experimental assay data/tables into a `MultiAssayExperiment` object. We will use data from The Cancer Genome Atlas to build the predictor, converting it from a `MultiAssayExperiment` object into a list to illustrate how to utilize the `convertToMAE()` wrapper function.
+
+We will integrate two types of -omic data:
+
+* gene expression from Agilent mRNA microarrays and
+* miRNA sequencing 
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(crop=NULL)
+```
+
+# Setup
+
+First, we load the `netDx` package.
+
+```{r,eval=TRUE}
+suppressWarnings(suppressMessages(require(netDx)))
+```
+
+# Data 
+
+For this example we pull data from The Cancer Genome Atlas through the Bioconductor `curatedTCGAData` package.
+
+```{r,eval=TRUE}
+suppressMessages(library(curatedTCGAData))
+```
+
+We fetch the two layers of data that we need:
+
+```{r, eval=TRUE}
+brca <- suppressMessages(curatedTCGAData("BRCA",
+                                         c("mRNAArray", 
+										 "miRNASeqGene"),
+                                         dry.run=FALSE, version="1.1.38"))
+```
+
+The fetch command automatically brings in a `MultiAssayExperiment` object.
+
+```{r, eval = TRUE}
+summary(brca)
+```
+## Prepare Data
+
+This next code block prepares the TCGA data. In practice you would do this once, and save the data before running netDx, but we run it here to see an end-to-end example. 
+
+```{r, eval=TRUE}
+source("prepare_data.R")
+brca <- prepareData(brca,setBinary=TRUE)
+```
+
+The important thing is to create `ID` and `STATUS` columns in the sample metadata slot. netDx uses these to get the patient identifiers and labels, respectively.
+
+```{r,eval=TRUE}
+pID <- colData(brca)$patientID
+colData(brca)$ID <- pID
+```
+
+# Create feature design rules (patient similarity networks)
+
+To build the predictor using the netDx algorithm, we call the `buildPredictor()` function which takes patient data and variable groupings, and returns a set of patient similarity networks (PSN) as an output. The user can customize what datatypes are used, how they are grouped, and what defines patient similarity for a given datatype. This is done specifically by telling the model how to:
+
+* **group** different types of data and 
+* **define similarity** for each of these (e.g. Pearson correlation, normalized difference, etc.,).
+
+The relevant input parameters are:
+
+* `groupList`: sets of input data that would correspond to individual networks (e.g. genes grouped into pathways)
+* `sims`: a list specifying similarity metrics for each data layer
+
+## `groupList`: Grouping variables to define features
+
+The `groupList` object tells the predictor how to group units when constructing a network. For example, genes may be grouped into a network representing a pathway. This object is a list; the names match those of `dataList` while each value is itself a list and reflects a potential network.
+
+In this simple example we just create a single PSN for each datatype (mRNA gene expression, and miRNA expression data), containing all measures from that datatype, where measures can be individual genes, proteins, CpG bases (in DNA methylation data), clinical variables, etc., 
+
+
+```{r, eval=TRUE}
+expr <- assays(brca)
+groupList <- list()
+for (k in seq_along(expr)) {	# loop over all layers (no-op if there are none)
+	nm <- names(expr)[k]
+
+	# all measures from this layer go into our single PSN
+	tmp <- list(rownames(expr[[k]]))
+	# assign same layer name as in input data
+	names(tmp) <- nm
+	groupList[[nm]] <- tmp
+}
+```
+
+## `sims`: Define patient similarity for each network
+
+**What is this:** `sims` is used to define similarity metrics for each layer.
+This is done by providing a single list -  here, `sims` - that specifies the choice of similarity metric to use for each data layer. The `names()` for this list must match those in `groupList`. The corresponding value can either be a character if specifying a built-in similarity function, or a function. The latter is used if the user wishes to specify a custom similarity function.
+
+The current available options for built-in similarity measures are:
+
+* `pearsonCorr`: Pearson correlation (n>5 measures in set)
+* `normDiff`: normalized difference (single measure such as age)
+* `avgNormDiff`: average normalized difference (small number of measures)
+* `sim.pearscale`: Pearson correlation followed by exponential scaling
+* `sim.eucscale`: Euclidean distance followed by exponential scaling
+
+In this example, we choose Pearson correlation similarity for all data layers.
+
+```{r,eval=TRUE}
+sims <- list(a="pearsonCorr", b="pearsonCorr")
+names(sims) <- names(groupList)
+```
+
+# Conversion of raw assay data into MultiAssayExperiment format
+
+Data pulled from The Cancer Genome Atlas through the Bioconductor `curatedTCGAData` package is automatically returned in the form of a `MultiAssayExperiment` object. However, most workflows that might utilize the netDx algorithm will have experimental assay data and patient metadata in the form of data frames/matrices/tables.
+
+To facilitate ease-of-use, the netDx package has a built-in wrapper function `convertToMAE()` that takes in an input list of key-value pairs of experimental assay data and patient metadata, converting it into a `MultiAssayExperiment` object compatible with further analysis using the netDx algorithm. However, all relevant data engineering/preparation should be done before using the `convertToMAE()` wrapper function.
+
+This next code block converts the TCGA data into a list format to illustrate how one might use the `convertToMAE()` wrapper function.
+
+```{r, eval=TRUE}
+brcaData <- dataList2List(brca, groupList)
+```
+
+The keys of the input list of key-value pairs should be labelled according to the type of data corresponding to the value pairs (methylation, mRNA, proteomic, etc) and there must be a key-value pair that corresponds to patient IDs/metadata labelled `pheno`.
+
+```{r, eval=TRUE}
+# append the patient metadata as an element named `pheno`;
+# naming it here avoids assuming a fixed number of assays in the list
+brcaList <- c(brcaData$assays, list(pheno=brcaData$pheno))
+```
+
+We can now call the `convertToMAE()` wrapper function to convert the list containing experimental assay data and patient metadata into a `MultiAssayExperiment` object.
+
+```{r, eval=TRUE}
+brca <- convertToMAE(brcaList)
+```
+
+We can then proceed with the rest of the netDx workflow.
+
+# Build predictor
+
+Now we're ready to train our model. netDx uses parallel processing to speed up compute time. Let's use 75% available cores on the machine for this example. netDx also throws an error if provided an output directory that already has content, so let's clean that up as well.
+  
+```{r,eval=TRUE}
+nco <- round(parallel::detectCores()*0.75) # use 75% available cores
+message(sprintf("Using %i of %i cores", nco, parallel::detectCores()))
+
+outDir <- paste(tempdir(),"pred_output",sep=getFileSep()) # use absolute path
+if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
+numSplits <- 2L
+```
+
+Finally we call the function that runs the netDx predictor. We provide:
+
+* patient data  (`dataList`)
+* grouping rules (`groupList`)
+* list specifying choice of similarity metric to use for each grouping (`sims`)
+* number of train/test splits over which to collect feature scores and average performance (`numSplits`), 
+* maximum score for features in one round of feature selection  (`featScoreMax`, set to 2 in this example)
+* threshold to call feature-selected networks for each train/test split (`featSelCutoff`); only features scoring this value or higher will be used to classify test patients,
+* number of cores to use for parallel processing (`numCores`).
+
+The call below runs two train/test splits. Within each split, it:
+
+* splits data into train/test using the default split of 80:20 (`trainProp=0.8`)
+* score networks between 0 to 2 (i.e. `featScoreMax=2L`)
+* uses networks that score >=1 out of 2 (`featSelCutoff=1L`) to classify test samples for that split.
+
+In practice a good starting point is `featScoreMax=10`, `featSelCutoff=9` and `numSplits=10L`, but these parameters depend on the sample sizes in the dataset and heterogeneity of the samples. 
+
+```{r,eval=TRUE}
+t0 <- Sys.time()
+set.seed(42) # make results reproducible
+model <- suppressMessages(
+	buildPredictor(
+		dataList=brca,			## your data
+		groupList=groupList,	## grouping strategy
+		sims = sims,
+		outDir=outDir, 			## output directory
+		trainProp=0.8,			## pct of samples to use to train model in each split
+		numSplits=2L,			## number of train/test splits
+		featSelCutoff=1L,		## threshold for calling something feature-selected
+		featScoreMax=2L,		## max score for feature selection
+		numCores=nco,			## set higher for parallelizing
+		debugMode=FALSE,
+		keepAllData=FALSE,	## set to TRUE for debugging or low-level files used by the predictor
+		logging="none"
+  ))
+t1 <- Sys.time()
+print(t1-t0)
+```
+
+# Examine results
+
+Now we get model output, including performance for various train/test splits and consistently high-scoring features. 
+
+In the function below, we define top-scoring features as those which score two out of two in at least half of the train/test splits:
+
+```{r lab1-getresults,eval=TRUE}
+results <- getResults(model,unique(colData(brca)$STATUS),
+                      featureSelCutoff=2L,featureSelPct=0.50)
+```
+
+`results` contains `performance`, `selectedFeatures` for each patient label, and the table of feature scores (`featureScores`).
+
+```{r, eval=TRUE}
+summary(results)
+```
+
+Look at the performance:
+```{r, eval=TRUE}
+results$performance
+```
+
+Look at feature scores for all labels, across all train-test splits:
+
+```{r,  eval=TRUE}
+results$featureScores
+```
+
+Let's examine our confusion matrix:
+```{r, eval=TRUE}
+confMat <- confusionMatrix(model)
+```
+
+*Note: Rows of this matrix don't add up to 100% because the matrix is an average of the confusion matrices from all of the train/test splits.*
+
+And here are selected features, which are those scoring 2 out of 2 in at least half of the splits. This threshold is simply for illustration. In practice we would run at least 10 train/test splits (ideally 100+), and look for features that score 7+ out of 10 in >70% splits.
+
+```{r, eval=TRUE}
+results$selectedFeatures
+```
+We finally get the integrated PSN and visualize it using a tSNE plot:
+
+```{r, fig.width=8,fig.height=8, eval=TRUE}
+## Note: this call has been reported not to work when run inside RStudio.
+psn <- getPSN(brca,groupList,sims = sims,selectedFeatures=results$selectedFeatures)
+require(Rtsne)
+tsne <- tSNEPlotter(
+	psn$patientSimNetwork_unpruned, 
+	colData(brca)
+	)
+```
+
+# sessionInfo
+```{r}
+sessionInfo()
+```
\ No newline at end of file
diff --git a/vignettes/RunPredictorWorkflow.Rmd b/vignettes/RunPredictorWorkflow.Rmd
new file mode 100755
index 00000000..9795535b
--- /dev/null
+++ b/vignettes/RunPredictorWorkflow.Rmd
@@ -0,0 +1,433 @@
+---
+title: "01. Build & test classifier with clinical and multi-omic data & pathway features"
+author: "Shraddha Pai"
+package: netDx
+date: "`r Sys.Date()`"
+output: 
+  BiocStyle::html_document:
+    toc_float: true
+vignette: >
+    %\VignetteIndexEntry{01. Build & test classifier with clinical and multi-omic data & pathway features}
+    %\VignetteEngine{knitr::rmarkdown}
+    %\VignetteEncoding{UTF-8}
+---
+
+
+
+# Introduction
+
+In this example, we will use clinical data and three types of 'omic data for binary classification of breast tumours. We also use several strategies and definitions of similarity to create features.
+
+For this we will use data from The Cancer Genome Atlas, and will integrate four types of -omic data:
+
+* gene expression from Agilent mRNA microarrays
+* DNA methylation (Illumina HumanMethylation 27K microarrays)
+* proteomic measures from reverse-phase protein arrays, and 
+* miRNA sequencing 
+
+
+Figure 1 shows the rules for converting patient data into similarity networks, which serve as units of input (or "features") for the model.
+
+* Gene expression: Features are defined at the level of ***pathways***; i.e. a feature groups genes corresponding to the pathway. Similarity is defined as pairwise *Pearson correlation*
+ * Clinical variables: Each *variable* is its own feature and similarity is defined as *normalized difference*. 
+ * Proteomic and methylation data: Features are defined at the level of the entire *data layer*; a single feature is created for all of proteomic data, and the same for methylation. Similarity is defined by pairwise *Pearson correlation*
+
+
+```{r,eval=TRUE,echo=FALSE,out.width="100%",fig.cap="Predictor design."}
+knitr::opts_chunk$set(crop=NULL)
+knitr::include_graphics("images/vignette1_design.jpg")
+```
+ 
+ The overall workflow for building and testing the predictor is shown in Figure 2.  
+ 
+ We start with a dataset with 327 tumours. 90% of the samples are used to build the predictor (`buildPredictor()`) while 10% are randomly subsampled and set aside for independent validation (`subsampleValidationData()`). Building the predictor involves:
+
+ 1. Splitting samples into 80% training and 20% test samples (proportions can be changed by setting parameters for `buildPredictor()`)
+ 2. Running feature selection using the training samples, so that features are scored from 0 to some user-specified max value.
+ 3. Features passing cutoff are used to classify the 20% test samples.
+ 
+ This process is repeated with multiple random train/test splits to generate an average performance of the model on training data. Features with a consistent high score across all train/test splits are selected to build a final model.
+
+ The model is then evaluated on the independent validation set (here, the held-out 10% of samples), using only the consistently high scoring features (i.e. features passing selection). This step ascertains the model's test set performance.
+
+
+```{r,eval=TRUE,echo=FALSE,out.width="80%",fig.cap="Workflow."}
+knitr::include_graphics("images/vignette_workflow.png")
+``` 
+
+In this example, we build a minimal predictor to ensure the vignette builds in a feasible time. Here we use two train/test splits, score features from 0-2, and call features scoring >0 selected features within a given train/test split. For building the final model, we choose features that score 2 out of 2 in at least half the splits. These are not realistic parameters because they do not sufficiently resample training data to ensure generalizability.
+
+In practice reasonable values include 10 train/test splits, features scored at least out of 10, and the final model being built with features scoring 7 or higher. Try several designs to see which consistently generalizes to an independent test set (hyperparameter tuning).
+
+# Setup 
+Load the `netDx` package.
+
+```{r,eval=TRUE}
+suppressWarnings(suppressMessages(require(netDx)))
+```
+
+# Get and Prepare Data 
+
+In this example, we use curated multi-modal data from The Cancer Genome Atlas, obtained from the Bioconductor `curatedTCGAData` package. Data for all cancer types profiled in TCGA are available through this package; [see this tutorial for details](https://bioconductor.org/packages/release/data/experiment/vignettes/curatedTCGAData/inst/doc/curatedTCGAData.html). 
+
+```{r,eval=TRUE}
+suppressMessages(library(curatedTCGAData))
+```
+
+Let's take a look at the available data for breast cancer, without downloading any (set `dry.run=TRUE`). 
+Note that from BioC 3.13 onwards, users may fetch one of two versions of TCGA data.
+
+```{r,eval=TRUE}
+curatedTCGAData(diseaseCode="BRCA", assays="*",dry.run=TRUE,version="1.1.38")
+```
+
+In this call we fetch only the gene expression, miRNA and methylation data; setting `dry.run=FALSE` initiates the fetching of the data.
+
+```{r,eval=TRUE}
+brca <- suppressMessages(
+   curatedTCGAData("BRCA",
+               c("mRNAArray","miRNA*","Methylation_methyl27*"),
+	dry.run=FALSE,version="1.1.38"))
+```
+
+This call returns a `MultiAssayExperiment` object. Recall that this is a container for storing multiple assays performed on the same set of samples. [See this tutorial](https://bioconductor.org/packages/release/bioc/vignettes/MultiAssayExperiment/inst/doc/QuickStartMultiAssay.html) to learn more.
+
+## MultiAssayExperiment object
+Skip this section if you are familiar with MultiAssayExperiment objects.
+
+Let's briefly explore the `brca` `MultiAssayExperiment` object.
+
+```{r, class.source="codeblock",eval=TRUE}
+brca
+```
+
+`assays()` returns a `list` with all -omic data associated with this object.  
+
+```{r, class.source="codeblock",eval=TRUE}
+summary(assays(brca))
+```
+`names()` shows the datatypes in each slot of `assays()`:
+
+```{r, class.source="codeblock",eval=TRUE}
+names(assays(brca))
+```
+
+So miRNA data is in slot 1, gene expression in slot 2, etc., 
+
+We can subset the data to see what it looks like. Let's do that for the miRNA data, looking at the first few measures (rows) for just the first five samples (columns).
+
+```{r, class.source="codeblock",eval=TRUE}
+mir <- assays(brca)[["BRCA_miRNASeqGene-20160128"]]
+head(mir[,1:5])
+```
+
+Patient metadata is contained in the `colData()` slot. Rows contain data for each patient and columns contain measures such as clinical characteristics:
+
+```{r, class.source="codeblock",eval=TRUE}
+pheno <- colData(brca)
+colnames(pheno)[1:20]
+head(pheno[,1:5])
+```
+
+## Prepare data
+This next code block prepares the TCGA data. This includes:
+
+* removing duplicate samples
+* reformatting patient IDs (e.g. removing spaces and hyphens)
+* creating an `ID` column in `colData(brca)`, which contains unique patient IDs
+* creating a `STATUS` column in `colData(brca)` which contains the patient labels (i.e what we want netDx to classify). 
+
+
+In practice you would prepare the dataset once and save it to file, then separately load it before running netDx; i.e. decouple data processing and running the predictor. The data processing code has been moved into a supporting file, `prepare_data.R`. You can explore it after the lab to see how some things are achieved (e.g. removing duplicate samples). For now, let's just run it.
+
+
+```{r,eval=TRUE}
+source("prepare_data.R")
+brca <- prepareData(brca,setBinary=TRUE)
+```
+
+The important thing is to create `ID` and `STATUS` columns in the sample metadata slot. netDx uses these to get the patient identifiers and labels, respectively.
+
+```{r,eval=TRUE}
+pID <- colData(brca)$patientID
+colData(brca)$ID <- pID
+```
+
+## Holdout validation set
+
+The function `subsampleValidationData()` partitions the TCGA data into two smaller datasets: a training and holdout validation dataset. This is to facilitate model validation after an initial model is built by the netDx algorithm - netDx will train a model on the training dataset through the `buildPredictor()` call, and this model can then be validated on the held-out validation dataset through the `predict()` function call.
+
+```{r,eval=TRUE}
+set.seed(123)
+dsets <- subsampleValidationData(brca,pctValidation=0.1)
+brca <- dsets$trainMAE
+holdout <- dsets$validationMAE
+```
+
+# Rules to create features (patient similarity networks)
+
+Now let's set up the data for input to netDx. 
+
+netDx allows the user to define how data is converted into patient similarity networks (or PSNs), which are the features that go into the model. This is done specifically by telling the model how to:
+
+* **group** different types of data and 
+* **define similarity** for each of these (e.g. Pearson correlation, normalized difference, etc.,).
+
+The relevant input parameters are:
+
+* `groupList`: sets of input data that would correspond to individual networks (e.g. genes grouped into pathways)
+* `sims`: a list specifying similarity metrics for each data layer
+
+### groupList: Grouping variables to define features
+**What is this:** `groupList` tells netDx how to group measures within a layer, to generate a PSN. Measures could be individual genes, proteins, CpG bases (in DNA methylation data), clinical variables, etc., 
+
+In this simple example we just create a single PSN for each datatype, containing all measures from that datatype.
+
+In the code below, we load pathway definitions from a small example file shipped with the netDx package (`extdata/pathway_ex3.gmt`) and group gene expression data by pathways; complete pathway definitions (e.g. for January 2021) are available from [https://downloads.res.oicr.on.ca/pailab/](https://downloads.res.oicr.on.ca/pailab/). To keep this example short, we limit to only three pathways, but in practice you would use all pathways meeting a size criterion; e.g. those containing between 10 and 500 genes (see the `MIN_SIZE` and `MAX_SIZE` parameters of `readPathways()`). 
+
+```{r,eval=TRUE}
+groupList <- list()
+
+# genes in mRNA data are grouped by pathways
+pathFile <- sprintf("%s/extdata/pathway_ex3.gmt", path.package("netDx"))
+pathList <- suppressMessages(readPathways(pathFile))
+groupList[["BRCA_mRNAArray-20160128"]] <- pathList
+```
+
+Let's take a look at `groupList`. Here is the first tier, which currently only has gene expression data. You can see that `groupList` has three features for gene expression data (`Length` is `3`).
+
+```{r, class.source="codeblock",eval=TRUE}
+summary(groupList)
+```
+
+Now we look at what comprises the pathway-features. Let's look at the names of the pathways:
+
+```{r, class.source="codeblock",eval=TRUE}
+names(groupList[["BRCA_mRNAArray-20160128"]])
+```
+
+How many genes are in the first pathway? Take a look at the genes in the pathway, using `head()`
+```{r,eval=TRUE}
+length(groupList[["BRCA_mRNAArray-20160128"]][[1]])
+```
+
+```{r,eval=TRUE}
+head(groupList[["BRCA_mRNAArray-20160128"]][[1]])
+```
+
+For clinical data, we do not group variables. Rather, we create one feature each for two variables:
+
+```{r,eval=TRUE}
+groupList[["clinical"]] <- list(
+    age="patient.age_at_initial_pathologic_diagnosis",
+	  stage="STAGE"
+)
+```
+
+For miRNA sequencing and methylation data we create one feature each, where each feature contains all measures for that data type.
+
+```{r,eval=TRUE}
+tmp <- list(rownames(experiments(brca)[[1]]));
+names(tmp) <- names(brca)[1]
+groupList[[names(brca)[[1]]]] <- tmp
+
+tmp <- list(rownames(experiments(brca)[[3]]));
+names(tmp) <- names(brca)[3]
+groupList[[names(brca)[3]]] <- tmp
+```
+
+### `sims`: Define patient similarity for each network
+
+**What is this:** `sims` is used to define similarity metrics for each layer.
+This is done by providing a single list -  here, `sims` - that specifies the choice of similarity metric to use for each data layer. The `names()` for this list must match those in `groupList`. The corresponding value can either be a character if specifying a built-in similarity function, or a function. The latter is used if the user wishes to specify a custom similarity function.
+
+The current available options for built-in similarity measures are:
+
+* `pearsonCorr`: Pearson correlation (n>5 measures in set)
+* `normDiff`: normalized difference (single measure such as age)
+* `avgNormDiff`: average normalized difference (small number of measures)
+* `sim.pearscale`: Pearson correlation followed by exponential scaling
+* `sim.eucscale`: Euclidean distance followed by exponential scaling
+
+In this example, we choose Pearson correlation similarity for all data layers except for the single-variable features in the `clinical` layer. For that we use normalized difference.
+
+```{r,eval=TRUE}
+sims <- list(
+  "BRCA_mRNAArray-20160128"="pearsonCorr",
+  clinical="normDiff",
+  "BRCA_miRNASeqGene-20160128"="pearsonCorr",
+  "BRCA_Methylation_methyl27-20160128"="pearsonCorr"
+  )
+
+```
+
+# Build predictor
+
+Now we're ready to train our model. netDx uses parallel processing to speed up compute time. Let's use 75% available cores on the machine for this example. netDx also throws an error if provided an output directory that already has content, so let's clean that up as well.
+  
+```{r,eval=TRUE}
+nco <- round(parallel::detectCores()*0.75) # use 75% available cores
+message(sprintf("Using %i of %i cores", nco, parallel::detectCores()))
+
+outDir <- paste(tempdir(),"pred_output",sep=getFileSep()) # use absolute path
+if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
+numSplits <- 2L
+```
+
+Finally we call the function that builds the predictor. We provide:
+
+* patient data  (`dataList`)
+* grouping rules (`groupList`)
+* list specifying choice of similarity metric to use for each grouping (`sims`)
+* number of train/test splits over which to collect feature scores and average performance (`numSplits`), 
+* maximum score for features in one round of feature selection  (`featScoreMax`, set to 2 in this example)
+* threshold to call feature-selected networks for each train/test split (`featSelCutoff`); only features scoring this value or higher will be used to classify test patients,
+* number of cores to use for parallel processing (`numCores`).
+
+The call below runs two train/test splits. Within each split, it:
+
+* splits data into train/test using the default split of 80:20 (`trainProp=0.8`)
+* score networks between 0 to 2 (i.e. `featScoreMax=2L`)
+* uses networks that score >=1 out of 2 (`featSelCutoff=1L`) to classify test samples for that split.
+
+In practice a good starting point is `featScoreMax=10`, `featSelCutoff=9` and `numSplits=10L`, but these parameters depend on the sample sizes in the dataset and heterogeneity of the samples.
+
+```{r,eval=TRUE}
+t0 <- Sys.time()
+model <- suppressMessages(
+buildPredictor(
+	dataList=brca,			## your data
+	groupList=groupList,	## grouping strategy
+	sims=sims,
+	outDir=outDir, 			## output directory
+	trainProp=0.8,			## pct of samples to use to train model in
+							    ## each split
+	numSplits=2L,			 ## number of train/test splits
+ 	featSelCutoff=1L,		## threshold for calling something
+							    ## feature-selected
+	featScoreMax=2L,	## max score for feature selection
+ numCores=nco,			  ## set higher for parallelizing
+ debugMode=FALSE,
+ keepAllData=FALSE,	    ## set to TRUE for debugging
+ logging="none"     ## set to "default" for messages
+  ))
+t1 <- Sys.time()
+print(t1-t0)
+```
+
+# Examine results
+
+We now use `getResults()` to fetch the model performance for the various train/test splits as well as feature scores:
+
+```{r,eval=TRUE}
+results <- getResults(
+    model,
+    unique(colData(brca)$STATUS),
+    featureSelCutoff=2L,
+    featureSelPct=0.50
+  )
+```
+
+`results` contains `performance`, `selectedFeatures` for each patient label, and the table of feature scores (`featureScores`).
+
+```{r, class.source="codeblock",eval=TRUE}
+summary(results)
+```
+
+Look at the performance:
+```{r, class.source="codeblock",eval=TRUE}
+results$performance
+```
+
+Look at feature scores for all labels, across all train-test splits:
+
+```{r, class.source="codeblock", eval=TRUE}
+results$featureScores
+```
+
+Let's examine our confusion matrix:
+```{r, class.source="codeblock",eval=TRUE}
+confMat <- confusionMatrix(model)
+```
+
+*Note: Rows of this matrix don't add up to 100% because the matrix is an average of the confusion matrices from all of the train/test splits.*
+
+And here are selected features, which are those scoring 2 out of 2 in at least half of the splits. This threshold is simply for illustration. In practice we would run at least 10 train/test splits (ideally 100+), and look for features that score 7+ out of 10 in >70% splits.
+
+```{r, class.source="codeblock",eval=TRUE}
+results$selectedFeatures
+```
+
+# Validate on independent samples
+Now we use `predict()` to classify samples in the independent dataset. We provide the model with feature design rules in `groupList`, the list of selected features to use in `selectedFeatures`, the similarity metrics used to convert data into patient similarity networks in `sims`, as well as the training and validation datasets in `brca` (`trainMAE`) and `holdout` (`testMAE`) respectively.
+
+The training data needs to be provided because netDx creates a single patient similarity network with both training and test data. It then uses label propagation to "diffuse" patient labels from training samples to test samples, and labels the latter based on which class they are most similar to.
+
+```{r,eval=TRUE}
+outDir <- paste(tempdir(), randAlphanumString(), 
+  sep = getFileSep())
+if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
+dir.create(outDir)
+
+predModel <- suppressMessages(
+  predict(trainMAE=brca, testMAE=holdout, 
+    groupList=groupList, 
+    selectedFeatures=results$selectedFeatures, 
+    sims=sims,
+    outDir=outDir, verbose = FALSE)
+)
+```
+
+# Plot results of validation
+
+Finally we examine how well our model performed, using `getPerformance()`. 
+
+Compute performance:
+```{r,eval=TRUE}
+perf <- getPerformance(predModel, 
+  unique(colData(brca)$STATUS))
+summary(perf)
+```
+
+We plot the AUROC and AUPR curves using `plotPerf_multi()`. In this example we get perfect separation of the two classes.
+
+```{r,eval=TRUE}
+plotPerf_multi(list(perf$rocCurve),
+  plotTitle = sprintf(
+    "BRCA Validation: %i samples", 
+    nrow(colData(holdout))))
+plotPerf_multi(list(perf$prCurve), 
+  plotType = "PR",
+  plotTitle = sprintf(
+    "BRCA Validation: %i samples", 
+    nrow(colData(holdout))))
+
+```
+
+## Integrated patient similarity network
+We finally get the integrated PSN and visualize it using a tSNE plot:
+
+```{r, class.source="codeblock",fig.width=8,fig.height=8, eval=TRUE}
+## Note: this call has been reported not to work when run inside RStudio.
+psn <- suppressMessages(getPSN(
+  brca,
+  groupList,
+  sims=sims,
+  selectedFeatures=results$selectedFeatures
+))
+```
+
+We can plot a lower dimensional representation of the patient similarities using a tSNE plot. This call requires that you install the `Rtsne` package:
+
+```{r}
+library(Rtsne)
+tsne <- tSNEPlotter(
+	psn$patientSimNetwork_unpruned, 
+	colData(brca)
+	)
+```
+
+# sessionInfo
+```{r,eval=TRUE}
+sessionInfo()
+```
diff --git a/vignettes/ThreeWayClassifier.Rmd b/vignettes/ThreeWayClassifier.Rmd
index 4b532ba7..6f1cf0ea 100755
--- a/vignettes/ThreeWayClassifier.Rmd
+++ b/vignettes/ThreeWayClassifier.Rmd
@@ -1,5 +1,5 @@
 ---
-title: "Build N-way classifier (N>2) from clinical and multi-omic data"
+title: "01. Build & test classifier with clinical and multi-omic data & pathway features"
 author: "Shraddha Pai"
 package: netDx
 date: "`r Sys.Date()`"
@@ -7,7 +7,7 @@ output:
   BiocStyle::html_document:
     toc_float: true
 vignette: >
-    %\VignetteIndexEntry{02. Build three-way classifier (N-way; N>2) from multi-omic data}
+    %\VignetteIndexEntry{01. Build & test classifier with clinical and multi-omic data & pathway features}
     %\VignetteEngine{knitr::rmarkdown}
     %\VignetteEncoding{UTF-8}
 ---
@@ -16,27 +16,67 @@ vignette: >
 
 # Introduction
 
-In this example, we will use clinical data and three types of 'omic data - gene expression, DNA methylation and proteomic data - for binary classification of breast tumours. We also use several strategies and definitions of similarity to create features. 
+In this example, we will use clinical data and three types of 'omic data for binary classification of breast tumours. We also use several strategies and definitions of similarity to create features.
 
- * Clinical variables: Each *variable* is its own feature (e.g. age); similarity is defined as *normalized difference*. 
- * Gene expression: Features are defined at the level of ***pathways***; i.e. a feature groups genes corresponding to the pathway. Similarity is defined as pairwise *Pearson correlation*
+For this we will use data from The Cancer Genome Atlas, and will integrate four types of -omic data: 
+
+* gene expression from Agilent mRNA microarrays
+* DNA methylation (Illumina HumanMethylation 27K microarrays)
+* proteomic measures from reverse-phase protein arrays, and 
+* miRNA sequencing 
+
+
+Figure 1 shows the rules for converting patient data into similarity networks, which serve as units of input (or "features") for the model.
+
+* Gene expression: Features are defined at the level of ***pathways***; i.e. a feature groups genes corresponding to the pathway. Similarity is defined as pairwise *Pearson correlation*
+ * Clinical variables: Each *variable* is its own feature and similarity is defined as *normalized difference*. 
  * Proteomic and methylation data: Features are defined at the level of the entire *data layer*; a single feature is created for all of proteomic data, and the same for methylation. Similarity is defined by pairwise *Pearson correlation*
 
-# Setup
+
+```{r,eval=TRUE,echo=FALSE,out.width="100%",fig.cap="Predictor design."}
+knitr::opts_chunk$set(crop=NULL)
+knitr::include_graphics("vignette_psn.png")
+```
+ 
+ The overall workflow for building and testing the predictor is shown in Figure 2.  
+ 
+ We start with a dataset with 295 tumours. 90% of the samples are used to build the predictor (`buildPredictor()`) while 10% are randomly subsampled and set aside for independent validation (`subsampleValidationData()`). Building the predictor involves:
+
+ 1. Splitting samples into 80% training and 20% test samples (proportions can be changed by setting parameters for `buildPredictor()`)
+ 2. Running feature selection using the training samples, so that features are scored from 0 to some user-specified max value.
+ 3. Features passing cutoff are used to classify the 20% test samples.
+ 
+ This process is repeated with multiple random train/test splits to generate an average performance of the model on training data. Features with a consistent high score across all train/test splits are selected to build a final model.
+
+ The model is then evaluated on the independent validation set (here, the held-out 10% of samples), using only the consistently high scoring features (i.e. features passing selection). This step ascertains the model's test set performance.
+
+
+```{r,eval=TRUE,echo=FALSE,out.width="80%",fig.cap="Workflow."}
+knitr::include_graphics("vignette_workflow.png")
+``` 
+
+In this example, we build a minimal predictor to ensure the vignette builds in a feasible time. Here we use two train/test splits, score features from 0 to 2, and treat features scoring >0 as selected within a given train/test split. For building the final model, we choose features that score 2 out of 2 in at least half the splits. These are not realistic parameters because they do not sufficiently resample training data to ensure generalizability.
+
+In practice reasonable values include 10 train/test splits, features scored at least out of 10, and the final model being built with features scoring 7 or higher. Try several designs to see which consistently generalizes to an independent test set (hyperparameter tuning).
+
+# Setup 
 Load the `netDx` package.
 
 ```{r,eval=TRUE}
 suppressWarnings(suppressMessages(require(netDx)))
 ```
 
-# Data 
+# Get and Prepare Data 
+
+In this example, we use curated multi-modal data from The Cancer Genome Atlas, obtained from the Bioconductor `curatedTCGAData` package. Data for all cancer types profiled in TCGA are available through this package; [see this tutorial for details](https://bioconductor.org/packages/release/data/experiment/vignettes/curatedTCGAData/inst/doc/curatedTCGAData.html). 
 
-For this example we pull data from the The Cancer Genome Atlas through the BioConductor `curatedTCGAData` package. The fetch command automatically brings in a `MultiAssayExperiment` object. 
 ```{r,eval=TRUE}
 suppressMessages(library(curatedTCGAData))
 ```
 
-We use the `curatedTCGAData()` command to look at available assays in the breast cancer dataset. 
+Let's take a look at the available data for breast cancer, without downloading any (set `dry.run=TRUE`). 
+Note that from BioC 3.13 onwards, users may fetch one of two versions of TCGA data.
+
 ```{r,eval=TRUE}
 curatedTCGAData(diseaseCode="BRCA", assays="*",dry.run=TRUE,version="1.1.38")
 ```
@@ -46,11 +86,60 @@ In this call we fetch only the gene expression, proteomic and methylation data;
 ```{r,eval=TRUE}
 brca <- suppressMessages(
    curatedTCGAData("BRCA",
-               c("mRNAArray","RPPA*","Methylation_methyl27*"),
+               c("mRNAArray","miRNA*","Methylation_methyl27*"),
 	dry.run=FALSE,version="1.1.38"))
 ```
 
-This next code block prepares the TCGA data. In practice you would do this once, and save the data before running netDx, but we run it here to see an end-to-end example. 
+This call returns a `MultiAssayExperiment` object. Recall that this is a container for storing multiple assays performed on the same set of samples. [See this tutorial](https://bioconductor.org/packages/release/bioc/vignettes/MultiAssayExperiment/inst/doc/QuickStartMultiAssay.html) to learn more.
+
+## MultiAssayExperiment object
+Skip this section if you are familiar with MultiAssayExperiment objects.
+
+Let's briefly explore the `brca` `MultiAssayExperiment` object.
+
+```{r, class.source="codeblock",eval=TRUE}
+brca
+```
+
+`assays()` returns a `list` with all -omic data associated with this object.  
+
+```{r, class.source="codeblock",eval=TRUE}
+summary(assays(brca))
+```
+`names()` shows the datatypes in each slot of `assays()`:
+
+```{r, class.source="codeblock",eval=TRUE}
+names(assays(brca))
+```
+
+So miRNA data is in slot 1, gene expression in slot 2, etc., 
+
+We can subset the data to see what it looks like. Let's do that for the miRNA data, looking at just the first five samples (columns):
+
+```{r, class.source="codeblock",eval=TRUE}
+mir <- assays(brca)[["BRCA_miRNASeqGene-20160128"]]
+head(mir[,1:5])
+```
+
+Patient metadata is contained in the `colData()` slot. Rows contain data for each patient and columns contain measures such as clinical characteristics:
+
+```{r, class.source="codeblock",eval=TRUE}
+pheno <- colData(brca)
+colnames(pheno)[1:20]
+head(pheno[,1:5])
+```
+
+## Prepare data
+This next code block prepares the TCGA data. This includes:
+
+* removing duplicate samples
+* reformatting patient IDs (e.g. removing spaces and hyphens)
+* creating an `ID` column in `colData(brca)`, which contains unique patient IDs
+* creating a `STATUS` column in `colData(brca)` which contains the patient labels (i.e. what we want netDx to classify). 
+
+
+In practice you would prepare the dataset once and save it to file, then separately load it before running netDx; i.e. decouple data processing and running the predictor. The data processing code has been moved into a supporting file, `prepare_data.R`. You can explore it after the lab to see how some things are achieved (e.g. removing duplicate samples). For now, let's just run it.
+
 
 ```{r,eval=TRUE}
 source("prepare_data.R")
@@ -64,28 +153,84 @@ pID <- colData(brca)$patientID
 colData(brca)$ID <- pID
 ```
 
+## Holdout validation set
+
+The function `subsampleValidationData()` partitions the TCGA data into two smaller datasets: a training dataset and a holdout validation dataset. This facilitates model validation after an initial model is built by the netDx algorithm: netDx trains a model on the training dataset through the `buildPredictor()` call, and this model is then used to classify the held-out validation samples through the `predict()` function call.
+
+```{r,eval=TRUE}
+set.seed(123)
+dsets <- subsampleValidationData(brca,pctValidation=0.1)
+brca <- dsets$trainMAE
+holdout <- dsets$validationMAE
+```
+
 # Rules to create features (patient similarity networks)
 
-Our plan is to group gene expression data by pathways and clinical data by single variables. We will treat methylation and proteomic data each as a single feature, so each of those groups will contain the entire input table for those corresponding data types. 
+Now let's set up the data for input to netDx. 
+
+netDx allows the user to define how data is converted into patient similarity networks (or PSNs), which are the features that go into the model. This is done specifically by telling the model how to:
+
+* **group** different types of data and 
+* **define similarity** for each of these (e.g. Pearson correlation, normalized difference, etc.,).
+
+The relevant input parameters are:
 
-In the code below, we fetch pathway definitions for January 2018 from (http://download.baderlab.org/EM_Genesets) and group gene expression data by pathways. To keep the example short, we limit to only three pathways, but in practice you would use all pathways meeting a size criterion; e.g. those containing between 10 and 500 genes. 
+* `groupList`: sets of input data that would correspond to individual networks (e.g. genes grouped into pathways)
+* `sims`: a list specifying similarity metrics for each data layer
 
-Grouping rules are accordingly created for the clinical, methylation and proteomic data. 
+### groupList: Grouping variables to define features
+**What is this:** `groupList` tells netDx how to group measures within a layer, to generate a PSN. Measures could be individual genes, proteins, CpG bases (in DNA methylation data), clinical variables, etc., 
+
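+In this example we group gene expression measures by pathway, treat each clinical variable as its own feature, and create a single PSN for each of the remaining datatypes, containing all measures from that datatype.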
+
+In the code below, we fetch pathway definitions for January 2021 from [https://downloads.res.oicr.on.ca/pailab/](https://downloads.res.oicr.on.ca/pailab/) and group gene expression data by pathways. To keep this example short, we limit to only three pathways, but in practice you would use all pathways meeting a size criterion; e.g. those containing between 10 and 500 genes (see the `MIN_SIZE` and `MAX_SIZE` parameters of `readPathways()`). 
 
 ```{r,eval=TRUE}
 groupList <- list()
 
 # genes in mRNA data are grouped by pathways
 pathFile <- sprintf("%s/extdata/pathway_ex3.gmt", path.package("netDx"))
-pathList <- readPathways(pathFile)
+pathList <- suppressMessages(readPathways(pathFile))
 groupList[["BRCA_mRNAArray-20160128"]] <- pathList
-# clinical data is not grouped; each variable is its own feature
+```
+
+Let's take a look at `groupList`. Here is the first tier, which currently only has gene expression data. You can see that `groupList` has three features for gene expression data (`Length` is `3`).
+
+```{r, class.source="codeblock",eval=TRUE}
+summary(groupList)
+```
+
+Now we look at what comprises the pathway-features. Let's look at the names of the pathways:
+
+```{r, class.source="codeblock",eval=TRUE}
+names(groupList[["BRCA_mRNAArray-20160128"]])
+```
+
+How many genes are in the first pathway? Take a look at the genes in the pathway, using `head()`
+```{r,eval=TRUE}
+length(groupList[["BRCA_mRNAArray-20160128"]][[1]])
+```
+
+```{r,eval=TRUE}
+head(groupList[["BRCA_mRNAArray-20160128"]][[1]])
+```
+
+For clinical data, we do not group variables. Rather, we create one feature each for two variables:
+
+```{r,eval=TRUE}
 groupList[["clinical"]] <- list(
-      age="patient.age_at_initial_pathologic_diagnosis",
-	   stage="STAGE"
+    age="patient.age_at_initial_pathologic_diagnosis",
+	  stage="STAGE"
 )
-# for methylation generate one feature containing all probes
-# same for proteomics data
+```
+
+For miRNA sequencing, methylation, and proteomic data we create one feature each, where each feature contains all measures for that data type.
+
+```{r,eval=TRUE}
+tmp <- list(rownames(experiments(brca)[[1]]));
+names(tmp) <- names(brca)[1]
+groupList[[names(brca)[[1]]]] <- tmp
+
 tmp <- list(rownames(experiments(brca)[[2]]));
 names(tmp) <- names(brca)[2]
 groupList[[names(brca)[2]]] <- tmp
@@ -95,116 +240,197 @@ names(tmp) <- names(brca)[3]
 groupList[[names(brca)[3]]] <- tmp
 ```
 
-## Define patient similarity for each network
+### `sims`: Define patient similarity for each network
 
-We provide `netDx` with a custom function to generate similarity networks (i.e. features). The first block tells netDx to generate correlation-based networks using everything but the clinical data. This is achieved by the call:
-```{r,eval=FALSE}
-makePSN_NamedMatrix(..., writeProfiles=TRUE,...)`
-```
+**What is this:** `sims` is used to define similarity metrics for each layer.
+This is done by providing a single list -  here, `sims` - that specifies the choice of similarity metric to use for each data layer. The `names()` for this list must match those in `groupList`. The corresponding value can either be a character if specifying a built-in similarity function, or a function. The latter is used if the user wishes to specify a custom similarity function.
 
-The second block makes a different call to `makePSN_NamedMatrix()` but this time, requesting the use of the normalized difference similarity metric. This is achieved by calling:
-```{r,eval=FALSE}
-   makePSN_NamedMatrix(,..., 
-                       simMetric="custom", customFunc=normDiff,
-                       writeProfiles=FALSE)
-```
+The current available options for built-in similarity measures are:
 
-`normDiff` is a function provided in the `netDx` package, but the user may define custom similarity functions in this block of code and pass those to `makePSN_NamedMatrix()`, using the `customFunc` parameter.
+* `pearsonCorr`: Pearson correlation (n>5 measures in set)
+* `normDiff`: normalized difference (single measure such as age)
+* `avgNormDiff`: average normalized difference (small number of measures)
+* `sim.pearscale`: Pearson correlation followed by exponential scaling
+* `sim.eucscale`: Euclidean distance followed by exponential scaling
 
-```{r,eval=TRUE}
-makeNets <- function(dataList, groupList, netDir,...) {
-	netList <- c() # initialize before is.null() check
-	# correlation-based similarity for mRNA, RPPA and methylation data
-	# (Pearson correlation)
-	for (nm in setdiff(names(groupList),"clinical")) {
-	   # NOTE: the check for is.null() is important!
-		if (!is.null(groupList[[nm]])) {
-		netList <- makePSN_NamedMatrix(dataList[[nm]],
-		             rownames(dataList[[nm]]),
-                   groupList[[nm]],netDir,verbose=FALSE,
-		             writeProfiles=TRUE,...) 
-		}
-	}
-	
-	# make clinical nets (normalized difference)
-	netList2 <- c()
-	if (!is.null(groupList[["clinical"]])) {
-	netList2 <- makePSN_NamedMatrix(dataList$clinical, 
-		rownames(dataList$clinical),
-		groupList[["clinical"]],netDir,
-		simMetric="custom",customFunc=normDiff, # custom function
-		writeProfiles=FALSE,
-		sparsify=TRUE,verbose=TRUE,...)
-	}
-	netList <- c(unlist(netList),unlist(netList2))
-	return(netList)
-}
+In this example, we choose Pearson correlation similarity for all data layers except for the single-variable features in the `clinical` layer. For that we use normalized difference.
 
+```{r,eval=TRUE}
+# Pearson correlation for every layer except "clinical", whose
+# single-variable features use normalized difference.
+# Keying on the layer name (rather than on position) keeps the
+# mapping correct if the order or number of layers in groupList changes.
+sims <- lapply(names(groupList), function(layer) {
+  if (layer == "clinical") "normDiff" else "pearsonCorr"
+})
+# names(sims) must match names(groupList)
+names(sims) <- names(groupList)
 ```
 
 # Build predictor
 
-Finally we make the call to build the predictor. 
- 
+Now we're ready to train our model. netDx uses parallel processing to speed up compute time. Let's use 75% available cores on the machine for this example. netDx also throws an error if provided an output directory that already has content, so let's clean that up as well.
+  
 ```{r,eval=TRUE}
-set.seed(42) # make results reproducible
-
-# location for intermediate work
-# set keepAllData to TRUE to not delete at the end of the 
-# predictor run.
-# This can be useful for debugging.
 nco <- round(parallel::detectCores()*0.75) # use 75% available cores
 message(sprintf("Using %i of %i cores", nco, parallel::detectCores()))
+
 outDir <- paste(tempdir(),"pred_output",sep=getFileSep()) # use absolute path
+if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
 numSplits <- 2L
-out <- suppressMessages(
-   buildPredictor(dataList=brca,groupList=groupList,
-      makeNetFunc=makeNets,
-      outDir=outDir, ## netDx requires absolute path
-      numSplits=numSplits, featScoreMax=2L, featSelCutoff=1L,
-	  numCores=nco)
+```
+
+Finally we call the function that builds the predictor. We provide:
+
+* patient data  (`dataList`)
+* grouping rules (`groupList`)
+* similarity metric to use for each data layer when creating PSNs (`sims`)
+* number of train/test splits over which to collect feature scores and average performance (`numSplits`), 
+* maximum score for features in one round of feature selection  (`featScoreMax`, set to 2 here)
+* threshold to call feature-selected networks for each train/test split (`featSelCutoff`); only features scoring this value or higher will be used to classify test patients,
+* number of cores to use for parallel processing (`numCores`).
+
+The call below runs two train/test splits. Within each split, it:
+
+* splits data into train/test using the default split of 80:20 (`trainProp=0.8`)
+* scores networks from 0 to 2 (i.e. `featScoreMax=2L`)
+* uses networks that score >=1 out of 2 (`featSelCutoff=1L`) to classify test samples for that split.
+
+In practice a good starting point is `featScoreMax=10`, `featSelCutoff=9` and `numSplits=10L`, but these parameters depend on the sample sizes in the dataset and heterogeneity of the samples.
+
+```{r,eval=TRUE}
+t0 <- Sys.time()  # start timer so the build time can be reported below
+model <- suppressMessages(
+  buildPredictor(
+    dataList=brca,          ## your data
+    groupList=groupList,    ## grouping strategy
+    sims=sims,              ## similarity metric for each layer
+    outDir=outDir,          ## output directory
+    trainProp=0.8,          ## proportion of samples used to train
+                            ## the model in each split
+    numSplits=numSplits,    ## number of train/test splits (set above)
+    featSelCutoff=1L,       ## threshold for calling something
+                            ## feature-selected
+    featScoreMax=2L,        ## max score for feature selection
+    numCores=nco,           ## set higher for parallelizing
+    debugMode=FALSE,
+    keepAllData=FALSE,      ## set to TRUE for debugging
+    logging="none"          ## set to "default" for messages
+  ))
+t1 <- Sys.time()
+print(t1-t0)
+```
+
+# Examine results
+
+We now use `getResults()` to fetch the model performance for the various train/test splits as well as feature scores:
+
+```{r,eval=TRUE}
+results <- getResults(
+    model,
+    unique(colData(brca)$STATUS),
+    featureSelCutoff=2L,
+    featureSelPct=0.50
+  )
+```
+
+`results` contains `performance`, `selectedFeatures` for each patient label, and the table of feature `scores`.
+
+```{r, class.source="codeblock",eval=TRUE}
+summary(results)
+```
+
+Look at the performance:
+```{r, class.source="codeblock",eval=TRUE}
+results$performance
+```
+
+Look at feature scores for all labels, across all train-test splits:
+
+```{r, class.source="codeblock", eval=TRUE}
+results$featureScores
+```
+
+Let's examine our confusion matrix:
+```{r, class.source="codeblock",eval=TRUE}
+confMat <- confusionMatrix(model)
+```
+
+*Note: Rows of this matrix don't add up to 100% because the matrix is an average of the confusion matrices from all of the train/test splits.*
+
+And here are selected features, which are those scoring 2 out of 2 in at least half of the splits. This threshold is simply for illustration. In practice we would run at least 10 train/test splits (ideally 100+), and look for features that score 7+ out of 10 in >70% splits.
+
+```{r, class.source="codeblock",eval=TRUE}
+results$selectedFeatures
+```
+
+# Validate on independent samples
+Now we use `predict()` to classify samples in the independent dataset. We provide the model with feature design rules in `groupList`, the list of selected features to use in `selectedFeatures`, the similarity metrics used to convert data into patient similarity networks in `sims`, as well as the training and validation datasets in `brca` and `holdout` respectively.
+
+The training data needs to be provided because netDx creates a single patient similarity network with both training and test data. It then uses label propagation to "diffuse" patient labels from training samples to test samples, and labels the latter based on which class they are most similar to.
+
+```{r,eval=TRUE}
+outDir <- paste(tempdir(), randAlphanumString(), 
+  sep = getFileSep())
+if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
+dir.create(outDir)
+
+predModel <- suppressMessages(
+  predict(trainMAE=brca, testMAE=holdout, 
+    groupList=groupList, 
+    selectedFeatures=results$selectedFeatures, 
+    sims=sims,
+    outDir=outDir, verbose = FALSE)
 )
 ```
 
-# Examine output
-The results are stored in the list object returned by the `buildPredictor()` call.
-This list contains:
+# Plot results of validation
 
-* `inputNets`: all input networks that the model started with. 
-* `Split<i>`: a list with results for each train-test split
-  * `featureScores`: feature scores for each label (list with `g` entries, where `g` is number of patient labels). Each entry contains the feature selection scores for the corresponding label.
-  * `featureSelected`: vector of features that pass feature selection. List of length `g`, with one entry per label.
-  * `predictions`: real and predicted labels for test patients
-  * `accuracy`: percent accuracy of predictions
+Finally we examine how well our model performed, using `getPerformance()`. 
 
+Compute performance:
 ```{r,eval=TRUE}
-summary(out)
-summary(out$Split1)
+perf <- getPerformance(predModel, 
+  unique(colData(brca)$STATUS))
+summary(perf)
 ```
 
-Compute accuracy for three-way classificationL
+We plot the AUROC and AUPR curves using `plotPerf_multi()`. In this example we get perfect separation of the two classes.
 
 ```{r,eval=TRUE}
-# Average accuracy
-st <- unique(colData(brca)$STATUS) 
-acc <- matrix(NA,ncol=length(st),nrow=numSplits) 
-colnames(acc) <- st 
-for (k in 1:numSplits) { 
-	pred <- out[[sprintf("Split%i",k)]][["predictions"]];
-	tmp <- pred[,c("ID","STATUS","TT_STATUS","PRED_CLASS",
-	                 sprintf("%s_SCORE",st))]
-	for (m in 1:length(st)) {
-	   tmp2 <- subset(tmp, STATUS==st[m])
-	   acc[k,m] <- sum(tmp2$PRED==tmp2$STATUS)/nrow(tmp2)
-	}
-}
-print(round(acc*100,2))
+plotPerf_multi(list(perf$rocCurve),
+  plotTitle = sprintf(
+    "BRCA Validation: %i samples", 
+    nrow(colData(holdout))))
+plotPerf_multi(list(perf$prCurve), 
+  plotType = "PR",
+  plotTitle = sprintf(
+    "BRCA Validation: %i samples", 
+    nrow(colData(holdout))))
+
 ```
-Also, examine the confusion matrix. We can see that the model perfectly classifies basal tumours, but performs poorly in distinguishing between the two types of luminal tumours. 
 
-```{r, eval=TRUE}
-res <- out$Split1$predictions
-print(table(res[,c("STATUS","PRED_CLASS")]))
+## Integrated patient similarity network
+We finally get the integrated PSN and visualize it using a tSNE plot:
+
+```{r, class.source="codeblock",fig.width=8,fig.height=8, eval=TRUE}
+## Get the integrated PSN. NOTE(review): this call reportedly fails inside RStudio - confirm.
+psn <- suppressMessages(getPSN(
+  brca,
+  groupList,
+  sims=sims,
+  selectedFeatures=results$selectedFeatures
+))
+```
+
+We can plot a lower dimensional representation of the patient similarities using a tSNE plot. This call requires that you install the `Rtsne` package:
+
+```{r}
+library(Rtsne)
+tsne <- tSNEPlotter(
+	psn$patientSimNetwork_unpruned, 
+	colData(brca)
+	)
 ```
 
 # sessionInfo
diff --git a/vignettes/ValidateNew.Rmd b/vignettes/ValidateNew.Rmd.old
similarity index 97%
rename from vignettes/ValidateNew.Rmd
rename to vignettes/ValidateNew.Rmd.old
index 9ada52fb..1ce8d012 100755
--- a/vignettes/ValidateNew.Rmd
+++ b/vignettes/ValidateNew.Rmd.old
@@ -176,9 +176,11 @@ if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
 dir.create(outDir)
 
 predModel <- suppressMessages(
-  predict(brca, holdout, groupList, 
-    results$selectedFeatures, makeNets,
-    outDir, verbose = FALSE)
+  predict(trainMAE=brca, testMAE=holdout, 
+    groupList=groupList, 
+    featSel=results$selectedFeatures, 
+    makeNetFunc=makeNets,
+    outDir=outDir, verbose = FALSE)
 )
 ```
 
diff --git a/vignettes/VisualizeTopScoringPathways.Rmd b/vignettes/VisualizeTopScoringPathways.Rmd
new file mode 100644
index 00000000..8fe2dce7
--- /dev/null
+++ b/vignettes/VisualizeTopScoringPathways.Rmd
@@ -0,0 +1,107 @@
+---
+title: "02. Visualize top-scoring pathways using Cytoscape"
+author: "Shraddha Pai"
+package: netDx
+date: "`r Sys.Date()`"
+output: 
+  BiocStyle::html_document:
+    toc_float: true
+vignette: >
+    %\VignetteIndexEntry{02. Visualize top-scoring pathways using Cytoscape}
+    %\VignetteEngine{knitr::rmarkdown}
+    %\VignetteEncoding{UTF-8}
+---
+
+
+In this example, we will load a trained netDx model and visualize top-scoring pathways using the popular network visualization software, [Cytoscape](https://cytoscape.org). 
+
+# Setup
+Let's download a pre-trained model created for binary breast tumour classification. 
+
+This model used only gene expression as input for 295 tumours classified as either "Luminal A" or not. This predictor used a *pathway-based design*, meaning that each input feature was created from a gene set representing a pathway. Pathways were compiled from a collection of curated pathway databases, including NetPath, mSigDB, Panther, Reactome, and IOB (Ref 1.). In this simple design, features are scored out of ten, over ten splits of train/test data. 
+
+Here we start by loading the model generated by running the `buildPredictor()` call:
+
+```{r,eval=TRUE}
+suppressMessages(require(netDx))
+data(model_full)
+```
+
+Let's examine the model. We can see that it has data from 10 train/test splits. 
+```{r,eval=TRUE}
+summary(model_full)
+```
+
+Let's examine the list of gene sets used to create pathway features.
+```{r,eval=TRUE}
+length(pathList)
+head(names(pathList))
+summary(pathList[1:3])
+```
+
+Now we generate results from the pathways, calling features that score 9+ out of 10 for over 70% of the train/test splits.
+```{r,eval=TRUE}
+results <- getResults(
+    model_full,
+    c("Luminal.A","other"), # patient classes
+    featureSelCutoff=9L,
+    featureSelPct=0.70,
+    drawPerformancePlot=FALSE
+)
+```
+
+We create the input for the network-based visualization. netDx automatically writes the input files to a user-provided directory (`outDir`), here the R session's temporary directory. These files are available if the user wants to generate the visualization from scratch in Cytoscape.
+
+```{r,eval=TRUE}
+vizmap <- createInputForFeatureNetworkView(
+    model=model_full,
+    results=results,
+    pathwayList=pathList,
+    EMapMinScore=3L, 
+    EMapMaxScore=10L,
+    EMapPctPass=0.7,
+    outDir=normalizePath(tempdir())
+)
+```
+
+We now run the command to generate the visualization in Cytoscape. 
+
+This code block has been disabled for the vignette because Cytoscape is not installed on BioConductor build systems. Follow instructions in the next section to run it in your local environment.
+
+# Generate the feature networks in Cytoscape
+Some parts of this vignette have been commented out as they require Cytoscape to be installed with the supporting apps needed for the visualization. To see a live demo of the network-based visualization, follow these steps:
+1. [Download and install Cytoscape](https://cytoscape.org/download.html). 
+2. Open Cytoscape. Using the App Store, install the 'EnrichmentMap' and 'AutoAnnotate' apps. 
+3. In R, [install RCy3](https://bioconductor.org/packages/release/bioc/html/RCy3.html).
+4. Uncomment the call to `viewSelectedFeaturesAsNetworks()` below and run the vignette code.
+
+A Cytoscape tutorial is out of scope for this vignette but we refer you to [http://manual.cytoscape.org](http://manual.cytoscape.org/).
+
+This step will generate the following network-based view of top-scoring features.  (Note: Exact node/edge position may vary).
+
+```{r,eval=FALSE, echo=TRUE}
+viewSelectedFeaturesAsNetworks(
+ vizmap$GMTfiles[[1]],
+ vizmap$NodeStyles[[1]],
+  hideNodeLabels = TRUE,
+ groupClusters = TRUE
+ )
+```
+
+The resulting view should look like this:
+
+```{r,eval=TRUE,echo=FALSE,fig.cap="Network visualization of features after applying grid layout"}
+knitr::include_graphics("images/featureNetworkView_gridLayout.png")
+```
+
+In Cytoscape, this type of visualization is called an Enrichment Map (Ref 1). In the above view, each *node* is a **high-scoring pathway-level feature**. We specified score thresholds in the call to `createInputForFeatureNetworkView()`, specifically limiting our view to features that score 3 or higher, in at least 70% of the train/test splits (`EMapMinScore=3L`,`EMapMaxScore=10L`, `EMapPctPass=0.7`). This wide range of scores is used to illustrate how node colours reflect feature scores. In practice you would likely want to focus on high-scoring features, such as those that consistently score 7 or higher.
+
+Each *edge* **connects two pathways with shared member genes**. 
+
+The *yellow bubbles* identify clusters of related features (here, pathways) and may be interpreted as a pathway theme. The labels above the yellow bubbles are auto-generated by Cytoscape using a word-cloud based approach. While a good start, they almost always require manual refinement after examining the pathways in a cluster. 
+
+# sessionInfo()
+
+```{r,eval=TRUE}
+sessionInfo()
+```
diff --git a/vignettes/images/cytoscapeLayoutScreenshot.png b/vignettes/images/cytoscapeLayoutScreenshot.png
new file mode 100644
index 00000000..e4a4fe12
Binary files /dev/null and b/vignettes/images/cytoscapeLayoutScreenshot.png differ
diff --git a/vignettes/images/featureNetworkView_gridLayout.png b/vignettes/images/featureNetworkView_gridLayout.png
new file mode 100644
index 00000000..b08c59e6
Binary files /dev/null and b/vignettes/images/featureNetworkView_gridLayout.png differ
diff --git a/vignettes/images/vignette1_design.jpg b/vignettes/images/vignette1_design.jpg
index b6983b21..31922774 100755
Binary files a/vignettes/images/vignette1_design.jpg and b/vignettes/images/vignette1_design.jpg differ
diff --git a/vignettes/images/vignette_workflow.png b/vignettes/images/vignette_workflow.png
new file mode 100644
index 00000000..4b69ce3a
Binary files /dev/null and b/vignettes/images/vignette_workflow.png differ
diff --git a/vignettes/makeSimFunction.R b/vignettes/makeSimFunction.R
new file mode 100644
index 00000000..21d9e57b
--- /dev/null
+++ b/vignettes/makeSimFunction.R
@@ -0,0 +1,117 @@
+#' Keywords naming the built-in similarity functions.
+allowedSims <- function(){
+  keywords <- c("pearson_corr","normDiff","avgNormDiff","sim.pearscale","sim.eucscale")
+  return(keywords)
+}
+
+#' Checks that every entry of `sims` is a valid similarity specification.
+#'
+#' An entry is valid if it is a function (custom similarity) or a single
+#' character keyword listed by allowedSims().
+#' @param sims (list) keys are layer names, values are functions or characters
+#' (names of built-in similarity functions)
+#' @return TRUE if all pass check. Else throws error.
+checkSimValid <- function(sims){
+    allowed <- allowedSims() # call it: the bare name is the function itself
+    for (k in names(sims)){
+        cur <- sims[[k]]
+        if (is.function(cur)) next # custom functions are accepted as-is
+        if (!is.character(cur) || length(cur) != 1L){
+            stop(paste("Invalid sims datatype. ",
+                "sims entries must be functions or keywords (characters) ",
+                "for built-in similarity functions.",sep=""))
+        }
+        if (!cur %in% allowed){
+            # sprintf() fills every placeholder, including the allowed list
+            stop(sprintf(
+                "sims[[%s]] has invalid similarity type: %s. Allowed values are: {%s}",
+                k, cur, paste(allowed,collapse=",")))
+        }
+    }
+    return(TRUE)
+}
+
+#' Builds the PSNs for every layer named in `sims`, one helper call per layer.
+#' @param dataList,groupList,netDir stored in the settings given to each helper
+#' @param sims (list) layer name -> similarity function or built-in keyword
+#' @param ... forwarded to customPSN / corrPSN / builtInPSN
+#' @return unlist() of all helper results; layers missing from groupList are skipped
+makeNetFunc <- function(dataList, groupList, netDir, sims,...){
+    settings <- list(dataList=dataList,groupList=groupList,netDir=netDir,sims=sims)
+    netList <- c()
+    for (nm in names(sims)){
+        if (is.null(groupList[[nm]])) next # no groups for this layer
+        csim <- sims[[nm]]
+        cur_set <- settings
+        cur_set[["name"]] <- nm; cur_set[["similarity"]] <- csim
+        if (is.function(csim)) { # custom function: needs customPSN's `fn` argument
+            netList_cur <- customPSN(cur_set,csim,...)
+        } else if (csim == "pearson_corr") {
+            netList_cur <- corrPSN(cur_set,...)
+        } else {
+            netList_cur <- builtInPSN(cur_set,...)
+        }
+        netList <- c(netList,netList_cur)
+    }
+    unlist(netList)
+}
+
+#' Builds the PSN of one layer with a built-in (keyword) similarity function.
+#' @param settings (list) from makeNetFunc; uses its name, similarity,
+#' dataList, groupList and netDir entries
+#' @param ... forwarded to makePSN_NamedMatrix
+builtInPSN <- function(settings,...){
+    # keyword -> similarity function lookup
+    simLookup <- list(
+        "normDiff"=normDiff, "avgNormDiff"=avgNormDiff,
+        "sim.pearscale"=sim.pearscale, "sim.eucscale"=sim.eucscale
+    )
+    message(sprintf("Layer %s: Function %s",settings$name,settings$similarity))
+
+    layer <- settings$name
+    psn <- makePSN_NamedMatrix(
+        settings$dataList[[layer]],
+        rownames(settings$dataList[[layer]]),
+        settings$groupList[[layer]],
+        settings$netDir,
+        simMetric="custom",
+        customFunc=simLookup[[settings$similarity]], # selected by keyword
+        writeProfiles=FALSE,
+        sparsify=TRUE,...
+    )
+    return(psn)
+}
+
+#' Builds the PSN of one layer with a caller-supplied similarity function.
+#' @param settings (list) from makeNetFunc
+#' @param fn custom similarity function, passed on as customFunc
+#' @param ... forwarded to makePSN_NamedMatrix
+customPSN <- function(settings,fn, ...){
+    message(sprintf("Layer %s: CUSTOM FUNCTION",settings$name))
+    layer <- settings$name
+    psn <- makePSN_NamedMatrix(
+        settings$dataList[[layer]],
+        rownames(settings$dataList[[layer]]),
+        settings$groupList[[layer]],
+        settings$netDir,
+        simMetric="custom",customFunc=fn,
+        writeProfiles=FALSE,sparsify=TRUE,...)
+    return(psn)
+}
+
+#' wrapper for PSNs using Pearson correlation
+#' @param settings (list) from makeNetFunc
+#' @param ... forwarded to makePSN_NamedMatrix
+corrPSN <- function(settings,...){
+    message(sprintf("Layer %s: PEARSON CORR",settings$name))
+    nm <- settings$name
+    netList <- makePSN_NamedMatrix(
+        settings$dataList[[nm]],           ## this layer's data only, not the whole list
+        rownames(settings$dataList[[nm]]), ## names of measures (e.g. genes, CpGs)
+        settings$groupList[[nm]],          ## how to group measures in that layer
+        settings$netDir,                   ## leave this as-is, netDx will figure out where this is.
+        verbose=FALSE,
+        writeProfiles=TRUE,                ## use Pearson correlation-based similarity
+        ...)
+    return(netList)
+}
\ No newline at end of file
diff --git a/vignettes/vignette_psn.png b/vignettes/vignette_psn.png
new file mode 100644
index 00000000..8be149ee
Binary files /dev/null and b/vignettes/vignette_psn.png differ
diff --git a/vignettes/vignette_workflow.png b/vignettes/vignette_workflow.png
new file mode 100644
index 00000000..4b69ce3a
Binary files /dev/null and b/vignettes/vignette_workflow.png differ