diff --git a/R/pgx-api.R b/R/pgx-api.R index c429943c..7ab77a37 100644 --- a/R/pgx-api.R +++ b/R/pgx-api.R @@ -291,6 +291,7 @@ pgx.getFamilies <- function(pgx, nmin = 10, extended = FALSE) { #' @export pgx.getTopDrugs <- function(pgx, ct, n = 10, dir = 1, na.rm = TRUE, db = 1) { + if(is.null(pgx$drugs)) return(NULL) x <- pgx$drugs[[db]]$X[, ct] q <- pgx$drugs[[db]]$Q[, ct] annot <- pgx$drugs[[db]]$annot[, ] diff --git a/R/pgx-functions.R b/R/pgx-functions.R index 8e1d365c..ed470231 100644 --- a/R/pgx-functions.R +++ b/R/pgx-functions.R @@ -1464,7 +1464,8 @@ pgx.getGeneSetCollections <- function(gsets = rownames(playdata::GSETxGENE)) { #' @export filterProbes <- function(annot, genes) { ## check probe name, short probe name or gene name for match - p0 <- (toupper(sub(".*:", "", rownames(annot))) %in% toupper(genes)) + a0 <- mofa.strip_prefix(rownames(annot)) + p0 <- (toupper(a0) %in% toupper(genes)) p1 <- (toupper(rownames(annot)) %in% toupper(genes)) p_list <- list(p0, p1) @@ -1508,10 +1509,10 @@ rename_by2 <- function(counts, annot_table, new_id = "symbol", ##new_id="symbol";na.rm=TRUE;unique=TRUE;keep.prefix=FALSE - ## add rownames + ## add rownames and extra columns annot_table$rownames <- rownames(annot_table) annot_table$rownames2 <- sub("^[A-Za-z]+:", "", rownames(annot_table)) ## strip prefix - + if (is.matrix(counts) || inherits(counts, "Matrix") || is.data.frame(counts) || !is.null(dim(counts))) { type <- "matrix" @@ -1520,42 +1521,51 @@ rename_by2 <- function(counts, annot_table, new_id = "symbol", type <- "vector" probes <- names(counts) } - probe_match <- apply(annot_table, 2, function(x) sum(probes %in% x)) - if (max(probe_match, na.rm = TRUE) == 0) { - return(counts) - } - if( type == "vector") { - counts <- cbind(counts) - } - - from_id <- names(which.max(probe_match)) + ## handle old style annot without symbol column if(new_id=="symbol" && !"symbol" %in% colnames(annot_table) && "gene_name" %in% colnames(annot_table)) new_id <- "gene_name" - ## dummy do-noting return - if (new_id == from_id) { - sel <- which(probes %in% annot_table[,from_id]) - counts <- counts[sel, , drop=FALSE] - if(type == 'vector') counts <- counts[, 1] - return(counts) + ## strip prefix ?? + probes0 <- probes + probes <- mofa.strip_prefix(probes0) + + ## iterative matching of probes. + idx <- rep(NA, length(probes)) + names(idx) <- probes0 + probe_match <- apply(annot_table, 2, function(x) sum(probes %in% x)) + match.cols <- names(sort(probe_match,decreasing=TRUE)) + pp <- probes + for (k in match.cols) { + ##from_id <- names(which.max(probe_match)) + from <- annot_table[, k] + ii <- match(pp, from) + if(any(!is.na(ii))) { + kk <- which(!is.na(ii)) + idx[match(pp[kk],probes)] <- ii[kk] + } + pp <- probes[is.na(idx)] + if(length(pp)==0) break } - if (type == "vector") { + ## bail out if no match at all + if(all(is.na(idx))) { + return(counts) + } + + ## make sure matrix + if( type == "vector") { counts <- cbind(counts) } - - keep.prefix <- (keep.prefix && all(grepl(":", probes))) - - from <- annot_table[, from_id] - ## if (!any(duplicated(from)) || unique) { - ii <- match(probes, from) + + ## create matched counts/data table + keep.prefix <- (keep.prefix && all(grepl(":", probes0))) if (keep.prefix) { - dt <- mofa.get_prefix(probes) - new.name <- annot_table[ii, new_id] + dt <- mofa.get_prefix(probes0) + new.name <- annot_table[idx, new_id] new.name <- paste0(dt, ":", new.name) } else { - new.name <- annot_table[ii, new_id] + new.name <- annot_table[idx, new_id] } rownames(counts) <- new.name @@ -2606,3 +2616,5 @@ match.dataframe <- function(id, df, parallel=TRUE) { ## ========================================================================== ## ==================== END OF FILE ========================================= ## ========================================================================== + + diff --git a/R/pgx-mofa.R b/R/pgx-mofa.R index d1420dbd..5d0ed308 100644 --- a/R/pgx-mofa.R +++ b/R/pgx-mofa.R @@ -899,45 +899,60 @@ mofa.split_data <- function(X, keep.prefix = FALSE) { xx } +#' This merges list of multi-omics data to a single matrix. Columns +#' must match. Merge data by row after adding prefix. +#' #' @export mofa.merge_data <- function(xx) { do.call(rbind, mofa.prefix(xx)) } - #' This merges list of multi-omics data to a single matrix. Note that #' it can handle non-matched data by taking union of rownames or #' colnames and extending the final matrix. Be careful it can #' introduce NA in such non-matched cases. #' #' @export -mofa.merge_data2 <- function(xdata, prefix.rows=NULL, prefix.cols=NULL) { +mofa.merge_data2 <- function(xdata, merge.rows="prefix", merge.cols="union") { n1 <- length(Reduce(intersect,lapply(xdata,rownames))) n2 <- length(Reduce(intersect,lapply(xdata,colnames))) - c(n1,n2) - if(n1 && n2) { - message("WARNING: matrices are overlapping both in rows and columns") + rdim <- sapply(xdata,nrow) + cdim <- sapply(xdata,ncol) + if(n1 < min(rdim) && merge.rows!="prefix") { + message("WARNING: rows do not match") } - if(is.null(prefix.cols)) prefix.cols <- (n1 > 0 && n2 > 0) - if(is.null(prefix.rows)) prefix.rows <- (n1 > 0 && n2 > 0) + if(n2 < min(cdim) && merge.cols!="prefix") { + message("WARNING: columns do not match") + } + prefix.rows <- (merge.rows=="prefix") + prefix.cols <- (merge.cols=="prefix") if(prefix.cols) { - ## if rows overlap (i.e. same genes), prefix the column names - ## (i.e. different datasets) + ## prefix the column names. i.e. different datasets. for(i in 1:length(xdata)) { - nn <- sub("[A-Za-z]+:","",colnames(xdata[[i]])) + nn <- sub("^[A-Za-z]+:","",colnames(xdata[[i]])) colnames(xdata[[i]]) <- paste0(names(xdata)[i],":",nn) } + merge.cols <- "union" } if(prefix.rows) { ## if columns overlap (i.e. same samples), prefix the feature ## names. for(i in 1:length(xdata)) { - nn <- sub("[A-Za-z]+:","",rownames(xdata[[i]])) + nn <- sub("^[A-Za-z]+:","",rownames(xdata[[i]])) rownames(xdata[[i]]) <- paste0(names(xdata)[i],":",nn) } + merge.rows <- "union" + } + if(merge.rows == "intersect") { + allfeatures <- Reduce(intersect,lapply(xdata,rownames)) + } else { + allfeatures <- unique(unlist(lapply(xdata, rownames))) + } + if(merge.cols == "intersect") { + allsamples <- Reduce(intersect,lapply(xdata,colnames)) + } else { + allsamples <- unique(unlist(lapply(xdata, colnames))) } - allfeatures <- unique(unlist(lapply(xdata, rownames))) - allsamples <- unique(unlist(lapply(xdata, colnames))) D <- matrix(0, length(allfeatures), length(allsamples)) nn <- matrix(0, length(allfeatures), length(allsamples)) rownames(D) <- allfeatures @@ -987,11 +1002,11 @@ mofa.get_prefix <- function(x) { #' @export mofa.strip_prefix <- function(xx) { if (class(xx) == "character") { - xx <- sub("[A-Za-z0-9]+:", "", xx) + xx <- sub("^[A-Za-z0-9]+:", "", xx) return(xx) } if (class(xx) == "matrix") { - rownames(xx) <- sub("[A-Za-z0-9]+:", "", rownames(xx)) + rownames(xx) <- sub("^[A-Za-z0-9]+:", "", rownames(xx)) return(xx) } if (class(xx) %in% c("list", "array") || is.list(xx)) { @@ -2652,6 +2667,13 @@ lasagna.create_model <- function(data, pheno="pheno", ntop=1000, nc=20, add.sink=FALSE, intra=TRUE, fully_connect=FALSE, add.revpheno = TRUE, condition.edges=TRUE ) { + if(0) { + pheno="pheno"; ntop=1000; nc=20; + annot=NULL; use.gmt=TRUE; use.graphite=TRUE; + add.sink=FALSE; intra=TRUE; fully_connect=FALSE; + add.revpheno = TRUE; condition.edges=TRUE + } + if (pheno == "pheno") { Y <- expandPhenoMatrix(data$samples, drop.ref=FALSE) } else if (pheno == "expanded") { @@ -2686,7 +2708,8 @@ lasagna.create_model <- function(data, pheno="pheno", ntop=1000, nc=20, } ## what about not overlapping samples?? - X <- mofa.merge_data2(xx, prefix.rows=TRUE, prefix.cols=FALSE) + #X <- mofa.merge_data(xx) + X <- mofa.merge_data2(xx, merge.rows="prefix", merge.cols="union") ##remove(xx) kk <- intersect(colnames(X),rownames(Y)) X <- X[,kk] @@ -2700,7 +2723,7 @@ lasagna.create_model <- function(data, pheno="pheno", ntop=1000, nc=20, ## Compute BIG correlation matrix. WARNING can become huge! NOTE: ## Needs optimization using SPARSE matrix. suppressWarnings( R <- cor(t(X), use = "pairwise") ) - + ## Sink/source need to be connected allways ii <- grep("SINK|SOURCE",rownames(R)) if(length(ii)) { @@ -2763,6 +2786,7 @@ lasagna.create_model <- function(data, pheno="pheno", ntop=1000, nc=20, ## mask for GSETS/pathways connections??? if (use.gmt) { + ### fill me } ## define layers @@ -3012,7 +3036,8 @@ lasagna.plot3D <- function(graph, pos) { #' @export lasagna.prune_graph <- function(graph, ntop = 100, layers = NULL, normalize.edges = FALSE, min.rho = 0.3, - edge.sign = "both", edge.type = "both", + edge.sign = c("both","pos","neg","consensus")[1], + edge.type = c("both","inter","intra","both2")[1], filter = NULL, prune = TRUE) { @@ -3565,10 +3590,6 @@ mofa.normalizeExpression <- function(X, method1="maxMedian", method2="none") { return(normX) } - - - - ## ====================================================================== ## ====================================================================== ## ====================================================================== diff --git a/R/pgx-wgcna.R b/R/pgx-wgcna.R index 5ff7d601..dc0f5047 100644 --- a/R/pgx-wgcna.R +++ b/R/pgx-wgcna.R @@ -130,7 +130,7 @@ pgx.wgcna <- function( ## We augment pgx$GMT with original GSETxGENE to get better results ## for all modules. - GMT0 <- Matrix::t(playdata::GSETxGENE) + GMT0 <- getPlaydataGMT() GMT0 <- rename_by2(GMT0, pgx$genes, "symbol") GMT <- merge_sparse_matrix(pgx$GMT, GMT0) @@ -138,7 +138,6 @@ pgx.wgcna <- function( wgcna, annot = pgx$genes, GMT = GMT, - # gsetX = pgx$gsetX, methods = c("fisher", "gsetcor", "xcor"), ntop = 1000, xtop = 100, @@ -152,7 +151,7 @@ pgx.wgcna <- function( message("Annotating modules using ", ai_model) ai <- wgcna.describeModules( wgcna, - ntop = 25, + ntop = 50, model = ai_model, annot = pgx$genes, multi = FALSE, @@ -173,6 +172,12 @@ pgx.wgcna <- function( return(wgcna) } +getPlaydataGMT <- function() { + G1 <- Matrix::t(playdata::GSETxGENE) + G2 <- Matrix::t(playdata::MSETxMETABOLITE) + rownames(G2) <- sub(".*:","",rownames(G2)) + merge_sparse_matrix(G1, G2) +} #' @export wgcna.compute <- function(X, @@ -330,6 +335,7 @@ wgcna.compute <- function(X, ctx <- makeContrastsFromLabelMatrix(contrasts) ctx <- sign(ctx) ctx[ctx==0] <- NA + ctx[ctx==-1] <- 0 datTraits <- cbind( datTraits, ctx) } @@ -455,10 +461,12 @@ wgcna.compute_multiomics <- function(dataX, minmodsize = 10, minKME = 0.3, deepsplit = 2, + mergeCutHeight = 0.15, compute.enrichment = TRUE, + #xref = c("gx","px"), + xref = NULL, annot = NULL, GMT = NULL, - gsetX = NULL, drop.ref = FALSE, add.pheno = FALSE, add.gsets = FALSE, @@ -474,19 +482,31 @@ wgcna.compute_multiomics <- function(dataX, ) { if(0) { + power=6 + ngenes = 2000; do.consensus = 1 - cutMethod = "hybrid" - deepsplit = 2 - power = 12 - ngenes = 2000 - minmodsize = 10 - minKME = 0.3 - compute.enrichment = TRUE - gset.ntop = 1000 - gset.xtop = 100 - annot = pgx$genes - GMT = pgx$GMT ##?? - gsetX = pgx$gsetX + clustMethod = "average" + cutMethod = "hybrid" + clustMethod = "average"; + cutMethod = "hybrid"; + minmodsize = 10; + minKME = 0.3; + deepsplit = 2; + compute.enrichment = TRUE; + xref = c("gx","px"); + annot = NULL; + GMT = NULL; + drop.ref = FALSE; + add.pheno = FALSE; + add.gsets = FALSE; + do.consensus = FALSE; + gset.methods = c("fisher","gsetcor","xcor"); + gset.ntop = 1000; + gset.xtop = 100; + summary = TRUE; + ai_model = DEFAULT_LLM; + ai_experiment = ""; + verbose = 1; progress = NULL } @@ -495,16 +515,30 @@ wgcna.compute_multiomics <- function(dataX, dataX <- lapply(dataX, function(x) rename_by2(x, annot, "symbol")) } + if(!is.null(annot) && !is.null(GMT)) { + GMT <- rename_by2(GMT, annot, "symbol") + } + + if(add.gsets || compute.enrichment ) { + ## augment geneset matrix with original GSETxGENE + GMT0 <- getPlaydataGMT() + if(!is.null(annot)) GMT0 <- rename_by2(GMT0, annot, "symbol") + if(!is.null(GMT)) { + GMT <- merge_sparse_matrix(GMT, GMT0) + } else { + GMT <- GMT0 + } + } + ## add pheno matrix?? if(add.gsets) { - if(is.null(GMT)) GMT <- Matrix::t(playdata::GSETxGENE) - if(is.null(gsetX)) { - X <- do.call(rbind, dataX) - if(!is.null(annot)) GMT <- rename_by2(GMT, annot, "symbol") - kk <- intersect(rownames(X), rownames(GMT)) - if(length(kk)) gsetX <- plaid::plaid(X[kk,], GMT[kk,]) + X <- mofa.merge_data2(dataX, merge.rows="union") + if(!is.null(annot)) X <- rename_by2(X, annot, "symbol") + kk <- intersect(rownames(X), rownames(GMT)) + if(length(kk)) { + gsetX <- plaid::plaid(X[kk,], GMT[kk,]) + dataX$gs <- gsetX } - if(!is.null(gsetX)) dataX$gs <- gsetX } ## add phenomatrix?? @@ -517,35 +551,41 @@ wgcna.compute_multiomics <- function(dataX, if (any(dt.na)) { dataX[dt.na] <- lapply(dataX[dt.na], imputeMissing, method="SVD2") } - names(dataX) <- substring(names(dataX),1,2) + names(dataX) <- substring(names(dataX),1,2) ##??? #dataX <- mofa.topSD(dataX, ngenes) if(!is.null(progress)) { progress$set(message = paste("computing WGCNA modules..."), value = 0.33) } - if(is.null(power) || is.na(power) ) power <- "sft" - if(as.character(power[1]) %in% c("sft","iqr")) { - message("[wgcna.compute_multiomics] estimating power with method = ", power[1]) - est.power <- rep(NA, length(dataX)) + if(is.null(power)) power <- NA + nw <- length(dataX) + if(length(power) 0) { contlm <- apply(contY, 2, function(y) { - rho <- cor(datExpr, y, use="pairwise")[,1] - P.Value <- WGCNA::corPvalueStudent(rho, n = length(y)) + tt <- cortest(datExpr, y) + rho <- tt$rho[,1] + P.Value <- tt$pvalue[,1] data.frame(rho, P.Value) }) contlm <- contlm[!sapply(contlm, is.null)] @@ -1230,126 +1273,55 @@ wgcna.getGeneStats <- function(wgcna, trait, module=NULL, plot = TRUE, df } -#' Compute gene statistics with original datExpr but with consensus -#' colors/labels for each layers. A separate function -#' wgcna.getConsensusGeneStats() extracts clean tables from this -#' results object. -#' -#' @export -wgcna.computeConsensusGeneStats <- function(cons) { - k <- names(cons$layers)[1] - stats <- list() - for(k in names(cons$layers)) { - w <- cons$layers[[k]] - colors <- cons$net$colors - wMEs <- cons$net$multiMEs[[k]]$data - wnet <- list( MEs = wMEs, colors = colors) - stats[[k]] <- wgcna.computeGeneStats( - wnet, w$datExpr, w$datTraits, TOM=NULL) - } - return(stats) -} - -#' -#' -#' @export -wgcna.getConsensusGeneStats <- function(cons, stats, trait, module=NULL) { - ## create extended color vector - labels = paste0("ME",cons$net$colors) - gstats <- list() - for(k in names(stats)) { - gstats[[k]] <- wgcna.getGeneStats( - wgcna = NULL, - stats = stats[[k]], - labels = labels, - trait = trait, - plot = FALSE, - module = module, - col = NULL, - main = NULL - ) - } +## ---------------------------------------------------- +## Perform geneset analysis on modules +## ---------------------------------------------------- - ## Align rows - ff <- gstats[[1]]$feature - for(k in names(gstats)) { - ii <- match(ff, gstats[[k]]$feature) - gstats[[k]] <- gstats[[k]][ii,] +#' Merge multiple ME matrices into one. Allow different dimensions. +#' +wgcna.mergeME <- function(mlist, me2=NULL, prefix=FALSE) { + if(!is.null(me2) && !inherits(mlist,"list")) { + mlist <- list(mlist, me2) + } + all.samples <- sapply(mlist, rownames, simplify=FALSE) + all.samples <- unique(unlist(all.samples)) + if(prefix) { + for(i in 1:length(mlist)) { + colnames(mlist[[i]]) <- paste0(names(mlist)[i],":",colnames(mlist[[i]])) + } } - - ## Compute consensus statistics. Consensus statistics are computed - ## as geometric mean of score variables, and/or maximum pvalue for - ## p.value columns. - xcols <- c(3,4,6,8) - pcols <- c(10,5,7,9) - pcols1 <- c(5,7,9) - xcols <- c("score","moduleMembership","traitSignificance","foldChange") - pcols <- c("scorePvalue","MMPvalue","TSPvalue","foldChangePvalue") - pcols1 <- pcols[-1] - for(i in 1:length(gstats)) { - gstats[[i]][,'scorePvalue'] <- apply(gstats[[i]][,pcols1],1,max,na.rm=TRUE) + is.mat <- all(sapply(mlist, inherits, what="matrix")) + all.me <- sapply(mlist, colnames, simplify=FALSE) + all.me <- unique(unlist(all.me)) + M <- matrix(NA, nrow=length(all.samples), ncol=length(all.me)) + M <- as.data.frame(M) + rownames(M) <- all.samples + colnames(M) <- all.me + i=1 + for(i in 1:length(mlist)) { + ii <- match(rownames(mlist[[i]]), rownames(M)) + jj <- match(colnames(mlist[[i]]), colnames(M)) + M[ii,jj] <- mlist[[i]] } - ##xc <- lapply(gstats, function(x) log(abs(x[,xcols])*(x[,pcols]<0.05))) - xc <- lapply(gstats, function(x) log(abs(x[,xcols]))) - xc <- exp(Reduce('+', xc) / length(xc)) - xp <- Reduce(pmax, lapply(gstats, function(x) x[,pcols])) - df3 <- data.frame( gstats[[1]][,1:2], xc, xp) - df3 <- df3[,colnames(gstats[[1]])] - head(df3) - - ## Determine consensus status. Feature is 'C' (concordant) if sign - ## in all layers are equal and significant. 'D' (discordant) if sign - ## if not equal in all layers but significant. 'N' is any is - ## non-significant. - sign.pos <- Reduce('*',lapply(gstats,function(g) sign(g$score) == 1)) - sign.neg <- Reduce('*',lapply(gstats,function(g) sign(g$score) == -1)) - allsig <- Reduce('*',lapply(gstats,function(g) (g$scorePvalue) < 0.05)) - table(allsig) - consensus <- c("D","C")[ 1 + 1*(sign.pos | sign.neg)] - consensus[which(allsig==0)] <- 'N' - cons.df <- data.frame(df3[,1:2], consensus, df3[,-c(1,2)]) - head(cons.df) - - ## This creates the full stats matrix (all subgroups) - df1 <- gstats[[1]][,c("feature","module")] - df2 <- gstats[[1]][,0] - cols <- colnames(gstats[[1]])[-c(1:2)] - for(k in cols ) { - xx <- sapply(gstats, function(g) g[,k]) - df2[[k]] <- I(xx) - } - df2 <- do.call(cbind, lapply(df2,unclass)) - newcols <- unlist(lapply(cols, function(k) paste0(k,'.',names(gstats)))) - colnames(df2) <- newcols - full.df <- data.frame(df1, consensus=cons.df$consensus, df2) - - - ## sort?? - ii <- order(-cons.df$score * sign(mean(cons.df$score,na.rm=TRUE))) - cons.df <- cons.df[ii,] - full.df <- full.df[ii,] - - list( - consensus = cons.df, - full = full.df - ) + if(is.mat) M <- as.matrix(M) + return(M) } - -## ---------------------------------------------------- -## Perform geneset analysis on modules -## ---------------------------------------------------- - +#' Compute enrichment of each WGCNA module using various +#' methods. Handles single-type and multi-omics WGCNA objects. +#' #' @export wgcna.computeModuleEnrichment <- function(wgcna, + GMT, multi = FALSE, methods = c("fisher","gsetcor","xcor"), ntop = 200, xtop = 100, annot = NULL, - GMT = NULL, - gsetX = NULL, + xref = NULL, + min.genes = 3, + min.rho = 0.8, filter = NULL ) { @@ -1357,317 +1329,164 @@ wgcna.computeModuleEnrichment <- function(wgcna, wgcna <- list(gx = wgcna) } - if(!any( c("gx","px") %in% names(wgcna))) { - message("ERROR: datasets must have gx or px datatype!") + if(is.null(GMT)) { + message("ERROR: must provide GMT") return(NULL) } - - ## collapse features to symbol - selx <- intersect(c("gx","px"), names(wgcna))[1] - geneX <- t(as.matrix(wgcna[[selx]]$datExpr)) - symbol.col <- NULL - + + ## create fake annotation table if no user annotation table is + ## given. if(is.null(annot)) { gg <- lapply(wgcna, function(w) colnames(w$datExpr)) - gg1 <- lapply(names(wgcna), function(a) paste0(a,":",gg[[a]])) - gg <- as.character(unlist(gg)) - gg1 <- as.character(unlist(gg1)) - annot <- data.frame(feature = gg1, symbol = gg) - } - - if(is.null(GMT)) { - message("[wgcna.computeModuleEnrichment] Using playdata GSETxGENE genesets") - GMT <- Matrix::t(playdata::GSETxGENE) + ff <- list() + for(i in 1:length(gg)) ff[[i]] <- paste0(names(wgcna)[i],":",gg[[i]]) + gg <- unlist(gg) + ff <- unlist(ff) + annot <- data.frame(feature = ff, symbol = gg) + rownames(annot) <- NULL } - if(!is.null(annot)) { - symbol.col <- intersect(c("symbol","gene_name"),colnames(annot))[1] - geneX <- rename_by2(geneX, annot, symbol.col) - GMT <- rename_by2( GMT, annot, symbol.col) - } - if(length(intersect(rownames(geneX),rownames(GMT)))==0) { - message("[wgcna.computeModuleEnrichment] ERROR: no overlap for geneX and GMT features. Please add annotation.") - return(NULL) - } + ## make sure GMT features are in symbols + symbol.col <- intersect(c("symbol","gene_name"),colnames(annot))[1] + GMT <- rename_by2(GMT, annot, symbol.col) - if(is.null(gsetX)) { - message("[wgcna.computeModuleEnrichment] computing gsetX using PLAID") - gsetX <- plaid::plaid( geneX, GMT) + ## add cross-referencing data?? + xref <- intersect(xref, names(wgcna)) + if(length(xref)) { + message("[wgcna.computeModuleEnrichment] NOTE. Adding cross-correlation datatype: ", xref) } + + gsea <- list() + dtype = names(wgcna)[1] + for(dtype in names(wgcna)) { + + ## collapse features to symbol + sel <- unique(c(dtype, xref)) + datExpr <- lapply(wgcna[sel], function(w) t(as.matrix(w$datExpr))) + geneX <- mofa.merge_data2(datExpr, merge.rows="prefix", merge.cols="union") + geneX <- rename_by2(geneX, annot, symbol.col) - bg <- rownames(geneX) - bg <- intersect(bg, rownames(GMT)) - if (length(bg) == 0) { - message("[wgcna.computeModuleEnrichment] FATAL. no overlapping genes") - return(NULL) - } - G1 <- GMT[bg, , drop = FALSE] - if (!is.null(filter)) { - sel <- grep(filter, colnames(G1)) - if (length(sel)) G1 <- G1[, sel, drop = FALSE] - } - G1 <- G1[, which(Matrix::colSums(G1 != 0) >= 4), drop = FALSE] - - ## align dimensions - ss <- intersect(rownames(gsetX), colnames(G1)) - G1 <- G1[,ss,drop=FALSE] - gsetX <- gsetX[ss,] - - ## get eigengene members - me.genes <- lapply(wgcna, function(w) w$me.genes) - names(me.genes) <- NULL - me.genes <- unlist(me.genes,recursive=FALSE) - me.genes <- lapply(me.genes, function(g) probe2symbol(g, annot, query = symbol.col)) - - ## compute most correlated gx/px genes. limit xtop if geneX is too - ## small - ME <- lapply(wgcna, function(w) as.matrix(w$net$MEs)) - xtop <- min(xtop, round(nrow(geneX)/4)) - nbx.genes <- list() - for(i in 1:length(wgcna)) { - nbx.cor <- cor(t(geneX), ME[[i]]) - for(k in colnames(nbx.cor)) { - if(is.null(xtop)) { - n <- length(wgcna[[i]]$me.genes[[k]]) - } else { - n <- xtop - } - nbx.genes[[k]] <- head(names(sort(-nbx.cor[,k])), n) + ## check if overlap exists + bg <- intersect(rownames(geneX), rownames(GMT)) + if (length(bg) == 0) { + message("[wgcna.computeModuleEnrichment] WARNING. no overlapping genes for ", dtype) + next() } - } + G1 <- GMT[bg, , drop = FALSE] + if (!is.null(filter)) { + sel <- grep(filter, colnames(G1)) + if (length(sel)) G1 <- G1[, sel, drop = FALSE] + } + G1 <- G1[, which(Matrix::colSums(G1 != 0) >= min.genes), drop = FALSE] - ## add nearest expression neighbors to module genes - nbx.genes <- nbx.genes[names(me.genes)] - for(k in names(me.genes)) { - me.genes[[k]] <- unique(c(me.genes[[k]], nbx.genes[[k]])) + if(nrow(G1)>=3 && ncol(G1)>=3 ) { + ## get eigengene members. convert to symbols. + me.genes <- wgcna[[dtype]]$me.genes + me.genes <- lapply(me.genes, function(g) probe2symbol(g, annot, query = symbol.col)) + + ## Get eigengene matrix + ME <- as.matrix(wgcna[[dtype]]$net$MEs) + + dt.gsea <- wgcna.run_enrichment_methods( + ME = ME, + me.genes = me.genes, + GMT = G1, + geneX = geneX, + methods = methods, + ntop = ntop, + xtop = xtop, + min.rho = min.rho + ) + + ## add to results + gsea <- c(gsea, dt.gsea) + } } - - ## make single ME matrix - MEx <- do.call(cbind, ME) - gsea <- wgcna.run_enrichment_methods( - ME = MEx, - me.genes = me.genes, - G = G1, - geneX = geneX, - gsetX = gsetX, - methods = methods, - ntop = ntop - ) - return(gsea) } -#' -#' -#' @export -wgcna.getModuleCrossGenes <- function(wgcna, ref=NULL, ngenes = 100, - multi=TRUE, modules=NULL) -{ - if(!multi) { - wgcna <- list(gx = wgcna) - ref <- 'gx' - } - - if(is.null(ref)) ref <- head(intersect(names(wgcna),c("gx","px")),1) - if(is.null(ref) || !ref %in% names(wgcna)) ref <- names(wgcna)[1] +wgcna.run_enrichment_methods <- function(ME, me.genes, GMT, geneX, + methods = c("fisher","gsetcor","xcor"), + ntop = 400, xtop=100, min.genes = 3, + min.rho = 0.8) +{ + rho.list <- list() + pval.list <- list() - W <- wgcna[[ref]] - geneX <- W$datExpr + ## align matrices + bg <- intersect(rownames(GMT), rownames(geneX)) + GMT <- GMT[bg,] + geneX <- geneX[bg,] - MEx <- sapply(wgcna, function(w) as.matrix(w$net$MEs)) - MEx <- do.call(cbind, MEx) + ## select on minimum genes + sel <- which( Matrix::colSums(GMT!=0) >= min.genes ) + GMT <- GMT[,sel] + + ## create gsetX + gsetX <- plaid::plaid(geneX, matG=GMT) + message("Computing enrichment for ", nrow(gsetX), " genesets") + + ## Add highly cross-correlated genes. limit xtop if geneX is too + ## small. + if(xtop > 0) { + message("Adding high cross-correlated features. min.rho = ", min.rho) + xtop <- min(xtop, round(nrow(geneX)/4)) + nbx.genes <- list() + for(k in colnames(ME)) { + ss <- intersect(colnames(geneX),rownames(ME)) + if(length(ss)<3) next() + gx <- geneX[,ss,drop=FALSE] + mx <- ME[ss,k,drop=FALSE] + cx <- cor(t(gx), mx, use="pairwise")[,1] + cx <- cx[!is.na(cx) & abs(cx) > min.rho] + if(length(cx)) { + nbx.genes[[k]] <- head(names(sort(-cx)), xtop) + } else { + nbx.genes[[k]] <- c() + } + } - if(!is.null(modules)) { - modules <- intersect(modules, colnames(MEx)) - MEx <- MEx[,modules,drop=FALSE] + ## add to ME genes + for(k in names(me.genes)) { + me.genes[[k]] <- unique(c(me.genes[[k]], nbx.genes[[k]])) + } } - - nbx.cor <- cor(geneX, MEx) - nbx.list <- list() - for(k in colnames(nbx.cor)) { - ii <- head(order(-nbx.cor[,k]), ngenes) - rho <- nbx.cor[ii,k] - gene <- rownames(nbx.cor)[ii] - me <- W$net$labels[gene] - nbx.list[[k]] <- data.frame( gene = gene, rho = rho, module = me) - } - - ##if(length(nbx.list)==1) nbx.list <- nbx.list[[1]] - return(nbx.list) -} - - -#' Compute consensus enrichment by calculating overlapping enriched -#' terms. -#' -wgcna.computeConsensusModuleEnrichment <- function(cons, - GMT, - annot, - methods = c("fisher","gsetcor","xcor"), - min.genes = 10, - ntop = 400 ) -{ - if(0) { - methods = c("fisher","gsetcor","xcor") - min.genes = 10 - ntop = 400 - annot = NULL - GMT <- Matrix::t(playdata::GSETxGENE) - } - - gseaX <- list() - i=1 - for(i in 1:length(cons$datExpr)) { - - geneX <- t(cons$datExpr[[i]]) - dim(geneX) - - ## Rename everything to symbols - if(!is.null(annot)) { - geneX <- rename_by2(geneX, annot, "symbol") - GMT <- rename_by2(GMT, annot, "symbol") - } - ng <- length(intersect(rownames(geneX), rownames(GMT))) - if(ng == 0) { - message("[wgcna.computeConsensusModuleEnrichment] ERROR. No symbol overlap.") - return(NULL) - } - symbols <- intersect(rownames(GMT),rownames(geneX)) - message("[wgcna.computeConsensusModuleEnrichment] number of symbols: ", length(symbols)) - geneX <- geneX[symbols,] - GMT <- GMT[symbols,] - - ## select on minimum gene sets size - sel <- which( Matrix::colSums(GMT!=0) >= min.genes ) - GMT <- GMT[,sel] - - ## Compute single-samples gene set expression - gsetX <- plaid::plaid(geneX, matG=GMT) - dim(gsetX) - - ## Create extended Eigengene matrix (ME). ME should be nicely - ## normalized/scaled so we just rbind across datasets - ME <- cons$net$multiMEs[[i]]$data - dim(ME) - - ## get genes in modules - me.genes <- tapply(names(cons$net$colors), cons$net$colors, list) - names(me.genes) <- paste0("ME",names(me.genes)) - if(!is.null(annot)) { - me.genes <- lapply(me.genes, function(gg) probe2symbol(gg, annot)) - } - me.genes <- lapply(me.genes, function(g) intersect(g, symbols)) - rownames(ME) - colnames(geneX) <- rownames(ME) - colnames(gsetX) <- rownames(ME) - - k <- names(cons$datExpr)[i] - gseaX[[k]] <- wgcna.run_enrichment_methods( - ME, - me.genes = me.genes, - GMT= GMT, - geneX = geneX, - gsetX = gsetX, - methods = methods, - min.genes = min.genes, - ntop = ntop) - } - - cons.gsea <- list() - m=1 - for(m in names(gseaX[[1]])) { - xx <- lapply( gseaX, function(g) g[[m]] ) - sel <- Reduce(intersect, lapply(xx, rownames)) - if(length(sel) > 0) { - if(length(sel)==1) sel <- c(sel,sel) ## length==1 crashes... - xx <- lapply(xx, function(x) x[sel,,drop=FALSE] ) - xx.score <- sapply(xx, function(x) x[,"score"]) - colnames(xx.score) <- paste0("score.",colnames(xx.score)) - - xx.pvalue <- lapply(xx, function(x) x[,grep("^p",colnames(x))]) - xx.pvalue <- do.call(cbind, xx.pvalue) - - m.score <- rowMeans(xx.score,na.rm=TRUE) - m.pvalue <- apply(sapply(xx, function(x) x[,"p.value"]), 1, max, na.rm=TRUE) - m.qvalue <- p.adjust( m.pvalue ) - df <- data.frame( - module = xx[[1]]$module, - geneset = xx[[1]]$geneset, - score = m.score, - xx.score, - p.value = m.pvalue, - q.value = m.qvalue, - overlap = xx[[1]]$overlap, - genes = xx[[1]]$genes, - xx.pvalue - ) - df <- df[order(df$p.value),] - #df <- df[!duplicated(df$geneset),,drop=FALSE] - cons.gsea[[m]] <- df - } - } - - return(cons.gsea) -} - - -wgcna.run_enrichment_methods <- function(ME, me.genes, GMT, geneX, gsetX, - methods = c("fisher","gsetcor","xcor"), - ntop = 400, min.genes = 3 - ) -{ - - rho.list <- list() - pval.list <- list() - - ## align matrices - bg <- intersect(rownames(GMT), rownames(geneX)) - ss <- intersect(colnames(GMT), rownames(gsetX)) - GMT <- GMT[bg,ss] - geneX <- geneX[bg,] - gsetX <- gsetX[ss,] - - ## select on minimum genes - sel <- which( Matrix::colSums(GMT!=0) >= min.genes ) - GMT <- GMT[,ss] - gsetX <- gsetX[ss,] - - message("Computing enrichment for ", length(ss), " genesets") - - ## Here we correlate geneset score (averageCLR) with the module - ## eigengene (ME). This should select genesets correlated with the - ## ME. - if ("gsetcor" %in% methods && !is.null(gsetX)) { - message("[wgcna.run_enrichment_methods] calculating single-sample geneset correlation...") - rc.rho <- matrix(NA, ncol(GMT), ncol(ME)) - rc.pvalue <- matrix(NA, ncol(GMT), ncol(ME)) - dimnames(rc.rho) <- list( colnames(GMT), colnames(ME)) - dimnames(rc.pvalue) <- list( colnames(GMT), colnames(ME)) - jj <- which(rownames(gsetX) %in% colnames(GMT)) - kk <- intersect(colnames(gsetX), rownames(ME)) ## common samples - rho.jj <- cor(t(gsetX[jj,kk]), ME[kk,], use = "pairwise") - #rc.pvalue <- cor.pvalue(rc.rho, n = length(kk)) - pvalue.jj <- WGCNA::corPvalueStudent(rho.jj, n = length(kk)) - ii <- match(rownames(gsetX)[jj], rownames(rc.rho)) - rc.rho[ii,] <- rho.jj - rc.pvalue[ii,] <- pvalue.jj - rho.list[["gsetcor"]] <- rc.rho - pval.list[["gsetcor"]] <- rc.pvalue + ## Here we correlate geneset score (averageCLR) directly with the + ## module eigengene (ME). This should select genesets correlated + ## with the ME. + if ("gsetcor" %in% methods) { + message("[wgcna.run_enrichment_methods] calculating single-sample geneset correlation...") + rc.rho <- matrix(NA, ncol(GMT), ncol(ME)) + rc.pvalue <- matrix(NA, ncol(GMT), ncol(ME)) + dimnames(rc.rho) <- list( colnames(GMT), colnames(ME)) + dimnames(rc.pvalue) <- list( colnames(GMT), colnames(ME)) + jj <- which(rownames(gsetX) %in% colnames(GMT)) + kk <- intersect(colnames(gsetX), rownames(ME)) ## common samples + tt <- cortest(t(gsetX[jj,kk]), ME[kk,]) + rho.jj <- tt$rho + pvalue.jj <- tt$pvalue + ii <- match(rownames(gsetX)[jj], rownames(rc.rho)) + rc.rho[ii,] <- rho.jj + rc.pvalue[ii,] <- pvalue.jj + rho.list[["gsetcor"]] <- rc.rho + pval.list[["gsetcor"]] <- rc.pvalue } ## Here we correlate the module eigengene (ME) with genes and then ## do a gset.rankcor() on the ME correlation. if ("xcor" %in% methods) { - message("[wgcna.run_enrichment_methods] calculating eigengene GBA correlation...") - gba <- cor(t(geneX), ME, use = "pairwise") - rc <- gset.rankcor(gba, GMT, compute.p = TRUE) ## NEEDS CHECK!!! + message("[wgcna.run_enrichment_methods] calculating eigengene correlation...") + ss <- intersect(colnames(geneX),rownames(ME)) + rho <- cor(t(geneX[,ss,drop=FALSE]), ME[ss,,drop=FALSE], use="pairwise") + rho[is.na(rho)] <- 0 + rc <- gset.rankcor(rho, GMT, compute.p = TRUE) ## NEEDS CHECK!!! rho.list[["xcor"]] <- rc$rho pval.list[["xcor"]] <- rc$p.value } - + gmt <- mat2gmt(GMT) if(1) { ## we pre-select to make this faster @@ -1678,10 +1497,13 @@ wgcna.run_enrichment_methods <- function(ME, me.genes, GMT, geneX, gsetX, gmt <- gmt[sel] } - ## fGSEA + ## fGSEA. Compute gene correlation to eigengenes ME, then do + ## pre-ranked enrichment on rho value. if ("fgsea" %in% methods) { message("[wgcna.run_enrichment_methods] calculating module fgsea...") - xrho <- cor(t(geneX), ME, use = "pairwise") + ss <- intersect(colnames(geneX),rownames(ME)) + xrho <- cor(t(geneX[,ss,drop=FALSE]), ME[ss,,drop=FALSE], use = "pairwise") + xrho[is.na(xrho)] <- 0 res <- list() i=1 for(i in 1:ncol(xrho)) { @@ -1698,7 +1520,8 @@ wgcna.run_enrichment_methods <- function(ME, me.genes, GMT, geneX, gsetX, pval.list[["fgsea"]] <- pval } - ## Perform fisher-test on ME genes + ## Perform fisher-test on (extended) ME genes. The ME genes might + ## have been extended with most correlated genes. if ("fisher" %in% methods) { message("[wgcna.run_enrichment_methods] calculating Fisher tests...") @@ -1839,6 +1662,95 @@ wgcna.merge_block_dendrograms <- function(net, X, method = 1) { merged_hclust } +#' @export +wgcna.matchColors <- function(wgcna, refcolors) { + + oldcolors <- wgcna$net$colors + newcolors <- WGCNA::matchLabels(oldcolors, refcolors) + lut <- table(oldcolors, newcolors) + old2new <- colnames(lut)[max.col(lut)] + names(old2new) <- rownames(lut) + prefix <- substring(names(wgcna$me.colors),1,2)[1] + old2newME <- paste0(prefix,old2new) + names(old2newME) <- paste0(prefix,names(old2new)) + old2new <- c(old2new, old2newME) + + newcol <- function(x) { + array( old2new[x], dimnames = list(names(x))) + } + + ## rename everything in net object + wgcna$net$colors <- newcol(wgcna$net$colors) + if("labels" %in% names(wgcna$net)) { + wgcna$net$labels <- newcol(wgcna$net$labels) + } + + ## rename unmergedColors + if("unmergedColors" %in% names(wgcna$net)) { + wgcna$net$unmergedColors <- newcol(wgcna$net$unmergedColors) + } + names(wgcna$net$MEs) <- newcol(names(wgcna$net$MEs)) + + ## rename everything in wgcna object + names(wgcna$me.genes) <- newcol(names(wgcna$me.genes)) + wgcna$me.colors <- newcol(wgcna$me.colors) + names(wgcna$me.colors) <- newcol(names(wgcna$me.colors)) + colnames(wgcna$W) <- newcol(colnames(wgcna$W)) + if(!is.null(wgcna$modTraits)) { + rownames(wgcna$modTraits) <- newcol(rownames(wgcna$modTraits)) + } + + ## rename everything in stats object + if("stats" %in% names(wgcna) && !is.null(wgcna$stats)) { + rownames(wgcna$stats[['moduleTraitCor']]) <- newcol(rownames(wgcna$stats[['moduleTraitCor']])) + rownames(wgcna$stats[['moduleTraitPvalue']]) <- newcol(rownames(wgcna$stats[['moduleTraitPvalue']])) + colnames(wgcna$stats[['moduleMembership']]) <- newcol(colnames(wgcna$stats[['moduleMembership']])) + colnames(wgcna$stats[['MMPvalue']]) <- newcol(colnames(wgcna$stats[['MMPvalue']])) + } + return(wgcna) +} + +#' +#' +#' @export +wgcna.getModuleCrossGenes <- function(wgcna, ref=NULL, ngenes = 100, + multi=TRUE, modules=NULL) +{ + + if(!multi) { + wgcna <- list(gx = wgcna) + ref <- 'gx' + } + + if(is.null(ref)) ref <- head(intersect(names(wgcna),c("gx","px")),1) + if(is.null(ref) || !ref %in% names(wgcna)) ref <- names(wgcna)[1] + + W <- wgcna[[ref]] + geneX <- W$datExpr + + MEx <- sapply(wgcna, function(w) as.matrix(w$net$MEs)) + MEx <- do.call(cbind, MEx) + + if(!is.null(modules)) { + modules <- intersect(modules, colnames(MEx)) + MEx <- MEx[,modules,drop=FALSE] + } + + nbx.cor <- cor(geneX, MEx) + + nbx.list <- list() + for(k in colnames(nbx.cor)) { + ii <- head(order(-nbx.cor[,k]), ngenes) + rho <- nbx.cor[ii,k] + gene <- rownames(nbx.cor)[ii] + me <- W$net$labels[gene] + nbx.list[[k]] <- data.frame( gene = gene, rho = rho, module = me) + } + + ##if(length(nbx.list)==1) nbx.list <- nbx.list[[1]] + return(nbx.list) +} + ##========================================================================= ## CONSENSUS WGCNA @@ -1999,6 +1911,7 @@ wgcna.runConsensusWGCNA <- function(exprList, ctx <- makeContrastsFromLabelMatrix(contrasts) ctx <- sign(ctx) ctx[ctx==0] <- NA + ctx[ctx==-1] <- 0 datTraits <- cbind( datTraits, ctx) } @@ -2046,11 +1959,12 @@ wgcna.runConsensusWGCNA <- function(exprList, if(!is.null(progress)) progress$inc(0.2, "Computing enrichment...") message("[wgcna.runConsensusWGCNA] >>> computing module enrichment...") if(!is.null(GMT)) { - GMT0 <- Matrix::t(playdata::GSETxGENE) + GMT0 <- getPlaydataGMT() if(!is.null(annot)) GMT0 <- rename_by2(GMT0, annot, "symbol") GMT <- merge_sparse_matrix(GMT, GMT0) + remove(GMT0) } else { - GMT <- Matrix::t(playdata::GSETxGENE) + GMT <- getPlaydataGMT() if(!is.null(annot)) GMT <- rename_by2(GMT, annot, "symbol") } res$gsea <- wgcna.computeConsensusModuleEnrichment( @@ -2067,7 +1981,7 @@ wgcna.runConsensusWGCNA <- function(exprList, ai <- wgcna.describeModules( res, multi = FALSE, - ntop = 25, + ntop = 50, model = ai_model, annot = annot, experiment = ai_experiment, @@ -2081,51 +1995,6 @@ wgcna.runConsensusWGCNA <- function(exprList, res } -#' @export -wgcna.matchColors <- function(wgcna, refcolors) { - - oldcolors <- wgcna$net$colors - newcolors <- WGCNA::matchLabels(oldcolors, refcolors) - lut <- table(oldcolors, newcolors) - old2new <- colnames(lut)[max.col(lut)] - names(old2new) <- rownames(lut) - prefix <- substring(names(wgcna$me.colors),1,2)[1] - old2newME <- paste0(prefix,old2new) - names(old2newME) <- paste0(prefix,names(old2new)) - old2new <- c(old2new, old2newME) - - newcol <- function(x) { - array( old2new[x], dimnames = list(names(x))) - } - - ## rename everything in net object - wgcna$net$colors <- newcol(wgcna$net$colors) - if("labels" %in% names(wgcna$net)) { - wgcna$net$labels <- newcol(wgcna$net$labels) - } - - ## rename unmergedColors - if("unmergedColors" %in% names(wgcna$net)) { - wgcna$net$unmergedColors <- newcol(wgcna$net$unmergedColors) - } - names(wgcna$net$MEs) <- newcol(names(wgcna$net$MEs)) - - ## rename everything in wgcna object - names(wgcna$me.genes) <- newcol(names(wgcna$me.genes)) - wgcna$me.colors <- newcol(wgcna$me.colors) - names(wgcna$me.colors) <- newcol(names(wgcna$me.colors)) - colnames(wgcna$W) <- newcol(colnames(wgcna$W)) - rownames(wgcna$modTraits) <- newcol(rownames(wgcna$modTraits)) - - ## rename everything in stats object - if("stats" %in% names(wgcna) && !is.null(wgcna$stats)) { - rownames(wgcna$stats[['moduleTraitCor']]) <- newcol(rownames(wgcna$stats[['moduleTraitCor']])) - rownames(wgcna$stats[['moduleTraitPvalue']]) <- newcol(rownames(wgcna$stats[['moduleTraitPvalue']])) - colnames(wgcna$stats[['moduleMembership']]) <- newcol(colnames(wgcna$stats[['moduleMembership']])) - colnames(wgcna$stats[['MMPvalue']]) <- newcol(colnames(wgcna$stats[['MMPvalue']])) - } - return(wgcna) -} #' @export wgcna.createConsensusLayers <- function(exprList, @@ -2236,7 +2105,112 @@ wgcna.createConsensusLayers <- function(exprList, aligned[[k]] <- w } - return(aligned) + return(aligned) +} + +#' Compute gene statistics with original datExpr but with consensus +#' colors/labels for each layers. A separate function +#' wgcna.getConsensusGeneStats() extracts clean tables from this +#' results object. +#' +#' @export +wgcna.computeConsensusGeneStats <- function(cons) { + k <- names(cons$layers)[1] + stats <- list() + for(k in names(cons$layers)) { + w <- cons$layers[[k]] + colors <- cons$net$colors + wMEs <- cons$net$multiMEs[[k]]$data + wnet <- list( MEs = wMEs, colors = colors) + stats[[k]] <- wgcna.computeGeneStats( + wnet, w$datExpr, w$datTraits, TOM=NULL) + } + return(stats) +} + +#' +#' +#' @export +wgcna.getConsensusGeneStats <- function(cons, stats, trait, module=NULL) { + + ## create extended color vector + labels = paste0("ME",cons$net$colors) + gstats <- list() + for(k in names(stats)) { + gstats[[k]] <- wgcna.getGeneStats( + wgcna = NULL, + stats = stats[[k]], + labels = labels, + trait = trait, + plot = FALSE, + module = module, + col = NULL, + main = NULL + ) + } + + ## Align rows + ff <- gstats[[1]]$feature + for(k in names(gstats)) { + ii <- match(ff, gstats[[k]]$feature) + gstats[[k]] <- gstats[[k]][ii,] + } + + ## Compute consensus statistics. Consensus statistics are computed + ## as geometric mean of score variables, and/or maximum pvalue for + ## p.value columns. + xcols <- c(3,4,6,8) + pcols <- c(10,5,7,9) + pcols1 <- c(5,7,9) + xcols <- c("score","moduleMembership","traitSignificance","foldChange") + pcols <- c("scorePvalue","MMPvalue","TSPvalue","foldChangePvalue") + pcols1 <- pcols[-1] + for(i in 1:length(gstats)) { + gstats[[i]][,'scorePvalue'] <- apply(gstats[[i]][,pcols1],1,max,na.rm=TRUE) + } + ##xc <- lapply(gstats, function(x) log(abs(x[,xcols])*(x[,pcols]<0.05))) + xc <- lapply(gstats, function(x) log(abs(x[,xcols]))) + xc <- exp(Reduce('+', xc) / length(xc)) + xp <- Reduce(pmax, lapply(gstats, function(x) x[,pcols])) + df3 <- data.frame( gstats[[1]][,1:2], xc, xp) + df3 <- df3[,colnames(gstats[[1]])] + head(df3) + + ## Determine consensus status. Feature is 'C' (concordant) if sign + ## in all layers are equal and significant. 'D' (discordant) if sign + ## if not equal in all layers but significant. 'N' is any is + ## non-significant. + sign.pos <- Reduce('*',lapply(gstats,function(g) sign(g$score) == 1)) + sign.neg <- Reduce('*',lapply(gstats,function(g) sign(g$score) == -1)) + allsig <- Reduce('*',lapply(gstats,function(g) (g$scorePvalue) < 0.05)) + table(allsig) + consensus <- c("D","C")[ 1 + 1*(sign.pos | sign.neg)] + consensus[which(allsig==0)] <- 'N' + cons.df <- data.frame(df3[,1:2], consensus, df3[,-c(1,2)]) + head(cons.df) + + ## This creates the full stats matrix (all subgroups) + df1 <- gstats[[1]][,c("feature","module")] + df2 <- gstats[[1]][,0] + cols <- colnames(gstats[[1]])[-c(1:2)] + for(k in cols ) { + xx <- sapply(gstats, function(g) g[,k]) + df2[[k]] <- I(xx) + } + df2 <- do.call(cbind, lapply(df2,unclass)) + newcols <- unlist(lapply(cols, function(k) paste0(k,'.',names(gstats)))) + colnames(df2) <- newcols + full.df <- data.frame(df1, consensus=cons.df$consensus, df2) + + ## sort?? + ii <- order(-cons.df$score * sign(mean(cons.df$score,na.rm=TRUE))) + cons.df <- cons.df[ii,] + full.df <- full.df[ii,] + + list( + consensus = cons.df, + full = full.df + ) } @@ -2286,7 +2260,7 @@ wgcna.computeConsensusMatrix <- function(matlist, ydim, psig = 0.05, consfun="mi if(psig < 1) { ## enforce strong consensus. All layers must be strictly ## significant. - all.sig <- Reduce("*", lapply(pv, function(p) 1 * (p < psig))) + all.sig <- Reduce("*", lapply(pv, function(p) 1 * (p <= psig))) consZ[!all.sig] <- NA } return(consZ) @@ -2300,7 +2274,7 @@ wgcna.computeConsensusMatrix <- function(matlist, ydim, psig = 0.05, consfun="mi wgcna.computeDistinctMatrix <- function(matlist, ydim, psig = 0.05, min.diff=0.3, consmax = 0) { ## create difference module-trait matrix - pv <- mapply(function(z, n) corPvalueStudent(z, n), + pv <- mapply(function(z, n) WGCNA::corPvalueStudent(z, n), matlist, ydim, SIMPLIFY = FALSE) matsign <- lapply( matlist, sign ) Q <- matlist @@ -2326,6 +2300,118 @@ wgcna.computeDistinctMatrix <- function(matlist, ydim, psig = 0.05, min.diff=0.3 return(Q) } +#' Compute consensus enrichment by calculating overlapping enriched +#' terms. +#' +wgcna.computeConsensusModuleEnrichment <- function(cons, + GMT, + annot, + methods = c("fisher","gsetcor","xcor"), + min.genes = 3, + ntop = 400 ) +{ + if(0) { + methods = c("fisher","gsetcor","xcor") + min.genes = 3 + ntop = 400 + annot = NULL + GMT <- Matrix::t(playdata::GSETxGENE) + } + + if(is.null(GMT)) { + message("ERROR: must provide GMT") + return(NULL) + } + + gseaX <- list() + i=1 + for(i in 1:length(cons$datExpr)) { + + geneX <- t(cons$datExpr[[i]]) + dim(geneX) + + ## Rename everything to symbols + if(!is.null(annot)) { + geneX <- rename_by2(geneX, annot, "symbol") + GMT <- rename_by2(GMT, annot, "symbol") + } + ng <- length(intersect(rownames(geneX), rownames(GMT))) + if(ng == 0) { + message("[wgcna.computeConsensusModuleEnrichment] ERROR. No symbol overlap.") + return(NULL) + } + symbols <- intersect(rownames(GMT),rownames(geneX)) + message("[wgcna.computeConsensusModuleEnrichment] number of symbols: ", length(symbols)) + geneX <- geneX[symbols,] + GMT <- GMT[symbols,] + + ## select on minimum gene sets size + sel <- which( Matrix::colSums(GMT!=0) >= min.genes ) + GMT <- GMT[,sel] + + ## Create extended Eigengene matrix (ME). ME should be nicely + ## normalized/scaled so we just rbind across datasets + ME <- cons$net$multiMEs[[i]]$data + dim(ME) + + ## get genes in modules + me.genes <- tapply(names(cons$net$colors), cons$net$colors, list) + names(me.genes) <- paste0("ME",names(me.genes)) + if(!is.null(annot)) { + me.genes <- lapply(me.genes, function(gg) probe2symbol(gg, annot)) + } + me.genes <- lapply(me.genes, function(g) intersect(g, symbols)) + rownames(ME) + colnames(geneX) <- rownames(ME) + + k <- names(cons$datExpr)[i] + gseaX[[k]] <- wgcna.run_enrichment_methods( + ME, + me.genes = me.genes, + GMT= GMT, + geneX = geneX, + methods = methods, + min.genes = min.genes, + ntop = ntop) + } + + cons.gsea <- list() + m=1 + for(m in names(gseaX[[1]])) { + xx <- lapply( gseaX, function(g) g[[m]] ) + sel <- Reduce(intersect, lapply(xx, rownames)) + if(length(sel) > 0) { + if(length(sel)==1) sel <- c(sel,sel) ## length==1 crashes... + xx <- lapply(xx, function(x) x[sel,,drop=FALSE] ) + xx.score <- sapply(xx, function(x) x[,"score"]) + colnames(xx.score) <- paste0("score.",colnames(xx.score)) + + xx.pvalue <- lapply(xx, function(x) x[,grep("^p",colnames(x))]) + xx.pvalue <- do.call(cbind, xx.pvalue) + + m.score <- rowMeans(xx.score,na.rm=TRUE) + m.pvalue <- apply(sapply(xx, function(x) x[,"p.value"]), 1, max, na.rm=TRUE) + m.qvalue <- p.adjust( m.pvalue ) + df <- data.frame( + module = xx[[1]]$module, + geneset = xx[[1]]$geneset, + score = m.score, + xx.score, + p.value = m.pvalue, + q.value = m.qvalue, + overlap = xx[[1]]$overlap, + genes = xx[[1]]$genes, + xx.pvalue + ) + df <- df[order(df$p.value),] + #df <- df[!duplicated(df$geneset),,drop=FALSE] + cons.gsea[[m]] <- df + } + } + + return(cons.gsea) +} + #' @export wgcna.plotConsensusOverlapHeatmap <- function(net1, net2, @@ -2546,13 +2632,13 @@ wgcna.runPreservationWGCNA <- function(exprList, ## geneset enrichment of reference layer if(compute.enrichment) { message("[wgcna.runPreservationWGCNA] computing geneset enrichment...") - if(!is.null(GMT)) { - GMT0 <- Matrix::t(playdata::GSETxGENE) + GMT0 <- getPlaydataGMT() if(!is.null(annot)) GMT0 <- rename_by2(GMT0, annot, "human_ortholog") GMT <- merge_sparse_matrix(GMT, GMT0) + remove(GMT0) } else { - GMT <- Matrix::t(playdata::GSETxGENE) + GMT <- getPlaydataGMT() if(!is.null(annot)) GMT <- rename_by2(GMT, annot, "human_ortholog") } @@ -2560,7 +2646,6 @@ wgcna.runPreservationWGCNA <- function(exprList, pres$gsea <- wgcna.computeModuleEnrichment( pres$layers[[ref]], GMT = GMT, - gsetX = NULL, annot = annot, methods = gset.methods, ntop = 1000, @@ -3201,6 +3286,7 @@ wgcna.plotDendroAndColors <- function(wgcna, main=NULL, block=1, show.contrasts = FALSE, use.tree = 0, rm.na = TRUE, + sd.wt = 0, marAll = c(0.4, 5, 1, 0.2), setLayout=TRUE, ... ) { @@ -3251,6 +3337,8 @@ wgcna.plotDendroAndColors <- function(wgcna, main=NULL, block=1, X <- wgcna$datExpr Y <- net$MEs kme <- cor(X, Y, use="pairwise") + sdx <- matrixStats::colSds(X,na.rm=TRUE) + if(sd.wt>0) kme <- kme * (sdx / max(abs(sdx),na.rm=TRUE))**sd.wt kmeColors <- rho2bluered(kme) kmeColors <- kmeColors[gg,] colors <- cbind(colors, 0, kmeColors) @@ -3261,6 +3349,8 @@ wgcna.plotDendroAndColors <- function(wgcna, main=NULL, block=1, Y <- Y[,grep("_vs_",colnames(Y),invert=TRUE),drop=FALSE] if(NCOL(Y)>0) { kme <- cor(X, Y, use="pairwise") + sdx <- matrixStats::colSds(X,na.rm=TRUE) + if(sd.wt>0) kme <- kme * (sdx / max(abs(sdx),na.rm=TRUE))**sd.wt kmeColors <- rho2bluered(kme) kmeColors <- kmeColors[gg,,drop=FALSE] colors <- cbind(colors, 0, kmeColors) @@ -3272,6 +3362,8 @@ wgcna.plotDendroAndColors <- function(wgcna, main=NULL, block=1, Y <- Y[,grep("_vs_",colnames(Y)),drop=FALSE] if(NCOL(Y)>0) { kme <- cor(X, Y, use="pairwise") + sdx <- matrixStats::colSds(X,na.rm=TRUE) + if(sd.wt>0) kme <- kme * (sdx / max(abs(sdx),na.rm=TRUE))**sd.wt kmeColors <- rho2bluered(kme) kmeColors <- kmeColors[gg,,drop=FALSE] colors <- cbind(colors, 0, kmeColors) @@ -3285,6 +3377,8 @@ wgcna.plotDendroAndColors <- function(wgcna, main=NULL, block=1, Y <- wgcna$net$multiMEs for(i in 1:length(X)) { kme <- cor(X[[i]], Y[[i]]$data, use="pairwise") + sdx <- matrixStats::colSds(X[[i]],na.rm=TRUE) + if(sd.wt>0) kme <- kme * (sdx / max(abs(sdx),na.rm=TRUE))**sd.wt kmeColors <- rho2bluered(kme) kmeColors <- kmeColors[gg,,drop=FALSE] colnames(kmeColors) <- paste0(names(X)[i],":",colnames(kmeColors)) @@ -3299,6 +3393,8 @@ wgcna.plotDendroAndColors <- function(wgcna, main=NULL, block=1, Y1 <- Y1[,grep("_vs_",colnames(Y1),invert=TRUE),drop=FALSE] if(NCOL(Y1)) { kme <- cor(X[[i]], Y1, use="pairwise") + sdx <- matrixStats::colSds(X[[i]],na.rm=TRUE) + if(sd.wt>0) kme <- kme * (sdx / max(abs(sdx),na.rm=TRUE))**sd.wt kmeColors <- rho2bluered(kme) kmeColors <- kmeColors[gg,,drop=FALSE] colnames(kmeColors) <- paste0(names(X)[i],":",colnames(kmeColors)) @@ -3314,6 +3410,8 @@ wgcna.plotDendroAndColors <- function(wgcna, main=NULL, block=1, Y1 <- Y1[,grep("_vs_",colnames(Y1)),drop=FALSE] if(NCOL(Y1)) { kme <- cor(X[[i]], Y1, use="pairwise") + sdx <- matrixStats::colSds(X[[i]],na.rm=TRUE) + if(sd.wt>0) kme <- kme * (sdx / max(abs(sdx),na.rm=TRUE))**sd.wt kmeColors <- rho2bluered(kme) kmeColors <- kmeColors[gg,,drop=FALSE] colnames(kmeColors) <- paste0(names(X)[i],":",colnames(kmeColors)) @@ -3321,7 +3419,6 @@ wgcna.plotDendroAndColors <- function(wgcna, main=NULL, block=1, } } } - } if(rm.na) { @@ -3360,6 +3457,7 @@ wgcna.plotMultiDendroAndColors <- function(multi_wgcna, show.contrasts = FALSE, use.tree = 0, rm.na = TRUE, + sd.wt = 0, main = NULL, colorHeight = 0.5, marAll = c(0.4,5,1,0.2) @@ -3386,6 +3484,7 @@ wgcna.plotMultiDendroAndColors <- function(multi_wgcna, show.contrasts = show.contrasts, show.kme = show.kme, use.tree = use.tree, + sd.wt = sd.wt, setLayout = FALSE, main = main[k] ) @@ -3679,118 +3778,130 @@ wgcna.labels2colors <- function(colors, ...) { #' Plot membership correlation vs gene signficance (correlation with #' trait) to discover biomarkers/driver genes. #' -#' @export -wgcna.plotMMvsGS <- function(wgcna, module, trait, abs = TRUE, par = TRUE, - plotlib = "base") { - ## module="ME3";trait="activated=act" - moduleGenes <- wgcna$me.genes[[module]] - nSamples <- nrow(wgcna$datExpr) - - ## Module membership correlation (with p-values) - if ("stats" %in% names(wgcna)) { - moduleMembership <- wgcna$stats$moduleMembership - MMPvalue <- wgcna$stats$MMPvalue - } else { - moduleMembership <- as.data.frame(cor(wgcna$datExpr, wgcna$net$MEs, use = "p")) - MMPvalue <- as.data.frame(corPvalueStudent(as.matrix(moduleMembership), nSamples)) - } - - ## Gene-trait significance (trait correlation) (with p-values) - if ("stats" %in% names(wgcna)) { - traitSignificance <- wgcna$stats$traitSignificance - TSPvalue <- wgcna$stats$TSPvalue - } else { - traitSignificance <- as.data.frame(cor(wgcna$datExpr, wgcna$datTraits, use = "p")) - TSPvalue <- as.data.frame(corPvalueStudent(as.matrix(traitSignificance), nSamples)) - } - - x <- (moduleMembership[moduleGenes, module]) - y <- (traitSignificance[moduleGenes, trait]) - if (abs == TRUE) { - x <- abs(x) - y <- abs(y) - } - ## - px <- MMPvalue[moduleGenes, module] - py <- TSPvalue[moduleGenes, trait] - qx <- p.adjust(px, method = "fdr") - qy <- p.adjust(py, method = "fdr") - is.sig <- (qx < 0.05 & qy < 0.05) - sigx <- (qx < 0.05) - sigy <- (qy < 0.05) - ii <- which(is.sig) - qv <- quantile(x[ii], prob = 0.1)[1] - qh <- quantile(y[ii], prob = 0.1)[1] - - pos <- cbind(x, y) - rownames(pos) <- moduleGenes - is.sig1 <- c("notsig", "onesig", "sig")[1 + 1 * sigx + 1 * sigy] - hi1 <- NULL - ## hi1 <- head(rownames(pos),10) - col1 <- c("grey70", "grey20", "red2") - - if (par) par(mfrow = c(1, 1), mar = c(5, 5, 3, 2)) - if (plotlib == "ggplot") { - pgx.scatterPlotXY.GGPLOT( - pos, - var = is.sig1, hilight = hi1, col = col1, - xlab = paste("Module membership in", module, "module"), - ylab = paste("Gene significance for trait", trait), - title = paste("Module membership vs. gene significance\n"), - cex.title = 0.9, - girafe = FALSE - ) - } else if (plotlib == "girafe") { - pgx.scatterPlotXY.GGPLOT( - pos, - var = is.sig1, hilight = hi1, col = col1, - xlab = paste("Module membership in", module, "module"), - ylab = paste("Gene significance for trait", trait), - title = paste("Module membership vs. gene significance\n"), - cex.title = 0.7, cex.axis = 0.7, - girafe = TRUE - ) - } else { - ii <- which(is.sig1 == "notsig") - verboseScatterplot( - x[-ii], y[-ii], - xlab = paste("Module membership in", module, "module"), - ylab = paste("Gene significance for trait", trait), - main = paste("Module membership vs. gene significance\n"), - cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = col1[1] - ) - ii <- which(is.sig1 == "onesig") - points(x[ii], y[ii], col = col1[2]) - ii <- which(is.sig1 == "sig") - points(x[ii], y[ii], col = col1[3]) - abline(v = qv, h = qh, col = "darkred") - } -} +## wgcna.plotMMvsGS <- function(wgcna, module, trait, abs = TRUE, par = TRUE, +## plotlib = "base") { +## ## module="ME3";trait="activated=act" +## moduleGenes <- wgcna$me.genes[[module]] +## nSamples <- nrow(wgcna$datExpr) + +## ## Module membership correlation (with p-values) +## if ("stats" %in% names(wgcna)) { +## moduleMembership <- wgcna$stats$moduleMembership +## MMPvalue <- wgcna$stats$MMPvalue +## } else { +## moduleMembership <- as.data.frame(cor(wgcna$datExpr, wgcna$net$MEs, use = "p")) +## MMPvalue <- as.data.frame(WGCNA::corPvalueStudent(as.matrix(moduleMembership), nSamples)) +## } + +## ## Gene-trait significance (trait correlation) (with p-values) +## if ("stats" %in% names(wgcna)) { +## traitSignificance <- wgcna$stats$traitSignificance +## TSPvalue <- wgcna$stats$TSPvalue +## } else { +## traitSignificance <- as.data.frame(cor(wgcna$datExpr, wgcna$datTraits, use = "p")) +## TSPvalue <- as.data.frame(WGCNA::corPvalueStudent(as.matrix(traitSignificance), nSamples)) +## } + +## x <- (moduleMembership[moduleGenes, module]) +## y <- (traitSignificance[moduleGenes, trait]) +## if (abs == TRUE) { +## x <- abs(x) +## y <- abs(y) +## } +## ## +## px <- MMPvalue[moduleGenes, module] +## py <- TSPvalue[moduleGenes, trait] +## qx <- p.adjust(px, method = "fdr") +## qy <- p.adjust(py, method = "fdr") +## is.sig <- (qx < 0.05 & qy < 0.05) +## sigx <- (qx < 0.05) +## sigy <- (qy < 0.05) +## ii <- which(is.sig) +## qv <- quantile(x[ii], prob = 0.1)[1] +## qh <- quantile(y[ii], prob = 0.1)[1] + +## pos <- cbind(x, y) +## rownames(pos) <- moduleGenes +## is.sig1 <- c("notsig", "onesig", "sig")[1 + 1 * sigx + 1 * sigy] +## hi1 <- NULL +## ## hi1 <- head(rownames(pos),10) +## col1 <- c("grey70", "grey20", "red2") + +## if (par) par(mfrow = c(1, 1), mar = c(5, 5, 3, 2)) +## if (plotlib == "ggplot") { +## pgx.scatterPlotXY.GGPLOT( +## pos, +## var = is.sig1, hilight = hi1, col = col1, +## xlab = paste("Module membership in", module, "module"), +## ylab = paste("Gene significance for trait", trait), +## title = paste("Module membership vs. gene significance\n"), +## cex.title = 0.9, +## girafe = FALSE +## ) +## } else if (plotlib == "girafe") { +## pgx.scatterPlotXY.GGPLOT( +## pos, +## var = is.sig1, hilight = hi1, col = col1, +## xlab = paste("Module membership in", module, "module"), +## ylab = paste("Gene significance for trait", trait), +## title = paste("Module membership vs. gene significance\n"), +## cex.title = 0.7, cex.axis = 0.7, +## girafe = TRUE +## ) +## } else { +## ii <- which(is.sig1 == "notsig") +## verboseScatterplot( +## x[-ii], y[-ii], +## xlab = paste("Module membership in", module, "module"), +## ylab = paste("Gene significance for trait", trait), +## main = paste("Module membership vs. gene significance\n"), +## cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = col1[1] +## ) +## ii <- which(is.sig1 == "onesig") +## points(x[ii], y[ii], col = col1[2]) +## ii <- which(is.sig1 == "sig") +## points(x[ii], y[ii], col = col1[3]) +## abline(v = qv, h = qh, col = "darkred") +## } +## } #' @export wgcna.plotModuleTraitHeatmap <- function(wgcna, setpar = TRUE, cluster = FALSE, multi = FALSE, main = NULL, justdata = FALSE, transpose = FALSE, colorlabel = TRUE, + show = c("both","traits","contrasts")[1], nmax = -1, tmax = -1, text = TRUE, pstar = TRUE) { if(!multi) wgcna <- list(wgcna) - nSamples <- nrow(wgcna[[1]]$datExpr) - MEs <- do.call(cbind, lapply(wgcna, function(w) as.matrix(w$net$MEs))) + MEs <- lapply(wgcna, function(w) as.matrix(w$net$MEs)) + MEs <- wgcna.mergeME(MEs) + Y <- wgcna[[1]]$datTraits + sel <- 1:ncol(Y) + if(show=="traits") sel <- grep("_vs_",colnames(Y),invert=TRUE) + if(show=="contrasts") sel <- grep("_vs_",colnames(Y)) + Y <- Y[,sel,drop=FALSE] moduleTraitCor <- cor(MEs, Y, use = "pairwise.complete") + + #nSamples <- nrow(wgcna[[1]]$datExpr) + nSamples <- t(!is.na(MEs)) %*% (!is.na(Y)) + if(nmax > 0) { sel <- head(order(-apply(abs(moduleTraitCor), 1, max, na.rm=TRUE)),nmax) moduleTraitCor <- moduleTraitCor[sel,,drop=FALSE] + nSamples <- nSamples[sel,,drop=FALSE] } if(tmax > 0) { sel <- head(order(-apply(abs(moduleTraitCor), 2, max, na.rm=TRUE)),tmax) moduleTraitCor <- moduleTraitCor[,sel,drop=FALSE] + nSamples <- nSamples[,sel,drop=FALSE] } if(transpose) { moduleTraitCor <- t(moduleTraitCor) + nSamples <- t(nSamples) } wgcna.plotLabeledCorrelationHeatmap( @@ -3803,7 +3914,8 @@ wgcna.plotModuleTraitHeatmap <- function(wgcna, setpar = TRUE, cluster = FALSE, justdata = justdata, colorlabel = colorlabel, pstar = pstar - ) + ) + } @@ -3829,7 +3941,7 @@ wgcna.plotEigenGeneClusterDendrogram <- function(wgcna = NULL, if(multi) { ME <- lapply(wgcna, function(w) as.matrix(w$net$MEs)) - ME <- do.call(cbind, ME) + ME <- wgcna.mergeME(ME) Y <- wgcna[[1]]$datTraits } else { ME <- wgcna$net$MEs @@ -3837,10 +3949,10 @@ wgcna.plotEigenGeneClusterDendrogram <- function(wgcna = NULL, } if (length(add_traits)==1 && is.logical(add_traits) && add_traits==TRUE) { - ME <- cbind(ME, Y) + ME <- wgcna.mergeME(ME, Y) } else if (length(add_traits) > 0 && !is.logical(add_traits)) { sel <- intersect(add_traits, colnames(Y)) - if(length(sel)) ME <- cbind(ME, Y[,sel]) + if(length(sel)) ME <- wgcna.mergeME(ME, Y[,sel]) } } @@ -3890,6 +4002,7 @@ wgcna.plotEigenGeneAdjacencyHeatmap <- function(wgcna, colorlabel = TRUE, text = FALSE, pstar = TRUE, + power = 1, setMargins = TRUE, mar1 = c(5.6, 4.5, 1.8, 0), mar2 = c(8, 10, 4, 2), @@ -3930,15 +4043,14 @@ wgcna.plotEigenGeneAdjacencyHeatmap <- function(wgcna, justdata = FALSE add_me = TRUE } - if(!multi) wgcna <- list(gx=wgcna) # Matrix with eigengenes and traits - ME <- c() + ME <- NULL if(add_me) { ME <- lapply(wgcna, function(w) as.matrix(w$net$MEs)) - ME <- do.call(cbind, ME) + ME <- wgcna.mergeME(ME) } Y <- wgcna[[1]]$datTraits @@ -3947,21 +4059,34 @@ wgcna.plotEigenGeneAdjacencyHeatmap <- function(wgcna, if(!is.null(traits)) { sel <- intersect(traits, sel) } - ME <- cbind(ME, Y[,sel,drop=FALSE]) + if(is.null(ME)) { + ME <- Y[,sel,drop=FALSE] + } else { + ME <- wgcna.mergeME(ME, Y[,sel,drop=FALSE]) + } } if (!add_traits && !is.null(phenotype)) { - ME <- cbind(ME, Y[,phenotype,drop=FALSE]) + if(is.null(ME)) { + ME <- Y[,phenotype,drop=FALSE] + } else { + ME <- wgcna.mergeME(ME, Y[,phenotype,drop=FALSE]) + } } if (NCOL(ME) <= 2) ME <- cbind(ME, ME) ## error if ncol(ME)<=2 !!!! - ## eigengene correlation - impME <- svdImpute2(as.matrix(ME)) - R <- cor(impME, use="pairwise") - R0 <- R - - ## If phenotype is given we condition the heatmap using the + ## Compute eigengene correlation matrix. Repeat 'power' times for + ## higher order adjacency. + power <- round(power) + R <- ME + for(i in 1:power) { + tt <- cortest(R, R) + R <- tt$rho + nSamples <- tt$n + } + + ## If phenotype is given, we condition the heatmap using the ## correlation to the phenotype. if(!is.null(phenotype)) { ## proper sign in case of inhibitor layer (like miRNA) @@ -3971,7 +4096,6 @@ wgcna.plotEigenGeneAdjacencyHeatmap <- function(wgcna, ff <- list() for(k in names(wgcna)) { rho <- cor(ME, Y[,phenotype], use="pairwise")[,1] - ##rho <- wgcna[[k]]$modTraits[,phenotype] ff[[k]] <- layersign[k] * rho } names(ff) <- NULL @@ -3988,7 +4112,9 @@ wgcna.plotEigenGeneAdjacencyHeatmap <- function(wgcna, if(nmax>0) { if(!is.null(phenotype)) { - rho <- cor(ME, Y[,phenotype], use="pairwise")[,1] + y1 <- Y[,phenotype] + y1 <- y1[match(rownames(ME),names(y1))] + rho <- cor(ME, y1, use="pairwise")[,1] ii <- head(order(-abs(rho)), nmax) } else { ii <- head( order(-Matrix::rowMeans(R**2)), nmax) @@ -4014,11 +4140,14 @@ wgcna.plotEigenGeneAdjacencyHeatmap <- function(wgcna, if(plotDendro) par(mar=mar1) #fixclust=FALSE + R0 <- R + R0[is.na(R0)] <- 0 + if(fixclust) { ii <- rownames(R) hc <- hclust(as.dist(1 - R0[ii,ii]), method="average") } else { - hc <- hclust(as.dist(1 - R), method="average") + hc <- hclust(as.dist(1 - R0), method="average") } if(plotDendro) { par(cex=cex.lab) @@ -4031,7 +4160,8 @@ wgcna.plotEigenGeneAdjacencyHeatmap <- function(wgcna, ii <- hc$labels[hc$order] ii <- intersect(ii, rownames(R)) R1 <- R[rev(ii), ii] - nsamples <- nrow(Y) + #nsamples <- nrow(Y) + nsamples <- nSamples[rownames(R1),colnames(R1)] par(mar=mar2) wgcna.plotLabeledCorrelationHeatmap( R1, @@ -4045,7 +4175,8 @@ wgcna.plotEigenGeneAdjacencyHeatmap <- function(wgcna, cex.lab = cex.lab, cex.text = cex.text ) - } + } + invisible(R) } #' @export @@ -4149,7 +4280,7 @@ wgcna.plotEigenGeneGraph <- function(wgcna, add_traits = TRUE, main = NULL, ## require(igraph) if(multi) { ME <- lapply(wgcna, function(w) as.matrix(w$net$MEs)) - ME <- do.call(cbind, ME) + ME <- wgcna.mergeME(ME) if (add_traits) ME <- cbind(ME, wgcna[[1]]$datTraits) } else { ME <- wgcna$net$MEs @@ -4434,9 +4565,15 @@ wgcna.plotLabeledCorrelationHeatmap <- function(R, nSamples, } R <- R[ii, jj] } + R0 <- pmax(pmin(R, 1, na.rm=TRUE), -1, na.rm=TRUE) - Pvalue <- corPvalueStudent(R0, nSamples) - + ii <- which(nSamples < 3) + nSamples <- pmax(nSamples, 3) + Pvalue <- WGCNA::corPvalueStudent(R0, nSamples) + if(is.matrix(nSamples) && length(ii)>0) { + Pvalue[ii] <- NA + } + if (justdata) { return(R) } @@ -5121,8 +5258,8 @@ wgcna.scaleTOMs <- function(TOMs, scaleP=0.95) { #' @export wgcna.getTopGenesAndSets <- function(wgcna, annot=NULL, module=NULL, ntop=40, - level="gene", rename="symbol") { - + psig = 0.05, level="gene", rename="symbol") { + if("layers" %in% names(wgcna) && class(wgcna$datExpr) == "list") { cons <- wgcna.getConsensusTopGenesAndSets(wgcna, annot=annot, module=module, ntop=ntop, rename=rename) @@ -5133,27 +5270,41 @@ wgcna.getTopGenesAndSets <- function(wgcna, annot=NULL, module=NULL, ntop=40, message("[wgcna.getTopGenesAndSets] Error: no stats object") return(NULL) } - if(!"gsea" %in% names(wgcna)) warning("object has no enrichment results (gsea)") + if(!"gsea" %in% names(wgcna)) warning("object has no enrichment results (gsea)") - ## get top genes (highest kME) + ## get top genes by centrality-weighted-meanFC2 mm <- wgcna$stats$moduleMembership - if(!is.null(annot)) mm <- rename_by2(mm, annot, new_id=rename) + mm.sig <- 1*(wgcna$stats$MMPvalue <= psig) + ff <- sqrt(rowMeans(wgcna$stats$foldChange**2, na.rm=TRUE)) + mm <- mm * mm.sig * ff + if(!is.null(annot)) { + annot$gene_title <- paste0(annot$gene_title," (",annot$symbol,")") + mm <- rename_by2(mm, annot, new_id=rename) + } gg <- rownames(mm) mm <- as.list(data.frame(mm)) if(!is.null(module)) mm <- mm[which(names(mm) %in% module)] - sel.topgenes <- lapply(mm, function(x) head(order(-x),ntop) ) - topgenes <- lapply( sel.topgenes, function(i) gg[i]) + for(i in 1:length(mm)) names(mm[[i]]) <- gg + mm <- lapply(mm, function(x) x[x!=0]) + topgenes <- lapply(mm, function(x) names(head(sort(-x),ntop))) ## top genesets topsets <- NULL if("gsea" %in% names(wgcna)) { ee <- wgcna$gsea - if(!is.null(module)) ee <- ee[which(names(ee) %in% module)] - topsets <- lapply(ee,function(x) head(rownames(x),ntop)) + if(!all(sapply(ee,is.null))) { + if(!is.null(module)) ee <- ee[which(names(ee) %in% module)] + ee <- lapply(ee, function(x) x[which(x$p.value <= psig),] ) + topsets <- lapply(ee,function(x) head(rownames(x),ntop)) + } } ## top correlated phenotypes - M <- wgcna$modTraits + if(!is.null(wgcna$modTraits)) { + M <- wgcna$modTraits + } else { + M <- cor( wgcna$net$MEs, wgcna$datTraits, use="pairwise") + } toppheno <- apply(M, 1, function(x) names(which(x > 0.8*max(x)))) if(level=="geneset") { @@ -5167,22 +5318,21 @@ wgcna.getTopGenesAndSets <- function(wgcna, annot=NULL, module=NULL, ntop=40, #' @export wgcna.getMultiTopGenesAndSets <- function(multi_wgcna, annot=NULL, module=NULL, - ntop=40, level="gene", rename="symbol") { + psig=0.05, ntop=40, level="gene", rename="symbol") { toplist <- list() k=names(multi_wgcna)[1] for(k in names(multi_wgcna)) { topk <- wgcna.getTopGenesAndSets( multi_wgcna[[k]], module=module, annot=annot, - ntop=ntop, level=level, rename=rename) + ntop=ntop, psig=psig, level=level, rename=rename) if(!is.null(module)) { topk <- lapply( topk, function(s) s[which(names(s) %in% module)] ) } toplist[[k]] <- topk } - top <- list() - + top <- list() top$genes <- lapply(toplist, function(t) t[['genes']]) names(top$genes) <- NULL top$genes <- unlist(top$genes, recursive=FALSE) @@ -5203,17 +5353,23 @@ wgcna.getMultiTopGenesAndSets <- function(multi_wgcna, annot=NULL, module=NULL, #' @export -wgcna.getConsensusTopGenesAndSets <- function(wgcna, annot=NULL, module=NULL, ntop=40, +wgcna.getConsensusTopGenesAndSets <- function(cons, annot=NULL, module=NULL, ntop=40, level=c("gene","geneset")[1], rename="symbol" ) { - if(!"stats" %in% names(wgcna)) stop("object has no stats") - if(!"gsea" %in% names(wgcna)) warning("object has no enrichment results (gsea)") + if(!"stats" %in% names(cons)) stop("object has no stats") + if(!"gsea" %in% names(cons)) warning("object has no enrichment results (gsea)") + if(!is.null(annot)) { + annot$gene_title <- paste0(annot$gene_title," (",annot$symbol,")") + } + ## get top genes (highest kME) topgenesx <- list() - for(i in 1:length(wgcna$stats)) { - mm <- wgcna$stats[[i]]$moduleMembership - if(!is.null(annot)) mm <- rename_by2(mm, annot, rename) + for(i in 1:length(cons$stats)) { + mm <- cons$stats[[i]]$moduleMembership + if(!is.null(annot)) { + mm <- rename_by2(mm, annot, rename) + } gg <- rownames(mm) mm <- as.list(data.frame(mm)) if(!is.null(module)) mm <- mm[module] @@ -5228,19 +5384,24 @@ wgcna.getConsensusTopGenesAndSets <- function(wgcna, annot=NULL, module=NULL, nt topgenes <- mapply(intersect, topgenes, topgenesx[[k]], SIMPLIFY=FALSE) } topgenes <- lapply(topgenes, head, ntop) + + if(!is.null(module)) { + sel <- intersect(names(topgenes),module) + topgenes <- topgenes[sel] + } ## top genesets (as symbol!) topsets <- NULL - if("gsea" %in% names(wgcna)) { - ee <- wgcna$gsea + if("gsea" %in% names(cons)) { + ee <- cons$gsea ee <- ee[match(names(topgenes),names(ee))] - if(!is.null(module)) ee <- ee[module] + names(ee) <- names(topgenes) topsets <- lapply(ee,function(x) head(rownames(x),ntop)) } ## module traits - M <- lapply(wgcna$net$multiMEs, function(x) as.matrix(x$data)) - Y <- lapply(M, function(m) wgcna$datTraits[rownames(m),]) + M <- lapply(cons$net$multiMEs, function(x) as.matrix(x$data)) + Y <- lapply(M, function(m) cons$datTraits[rownames(m),]) R <- mapply( function(x,y) abs(cor(x,y,use="pairwise")), M, Y, SIMPLIFY=FALSE) R <- Reduce('+', R) toppheno <- apply(R, 1, function(x) names(which(x > 0.9*max(x,na.rm=TRUE))), @@ -5260,33 +5421,35 @@ wgcna.getConsensusTopGenesAndSets <- function(wgcna, annot=NULL, module=NULL, nt ##---------------------------------------------------------------------- #' @export -wgcna.describeModules <- function(wgcna, ntop=25, annot=NULL, multi=FALSE, +wgcna.describeModules <- function(wgcna, ntop=50, psig = 0.05, + annot=NULL, multi=FALSE, modules=NULL, experiment="", verbose=1, model=DEFAULT_LLM, docstyle = "detailed summary", numpar = 2, - level="gene", modules=NULL) { + level="gene") { + if(is.null(annot)) { + message("[wgcna.describeModules] WARNING. user annot table is recommended.") + } + if(multi) { - dbg("[wgcna.describeModules] calling getMultiTopGenesAndSets") top <- wgcna.getMultiTopGenesAndSets(wgcna, annot=annot, ntop=ntop, - level=level, rename="gene_title") + psig=psig, level=level, rename="gene_title") } else { - dbg("[wgcna.describeModules] calling getTopGenesAndSets") top <- wgcna.getTopGenesAndSets(wgcna, annot=annot, ntop=ntop, - level=level, rename="gene_title") + psig=psig, level=level, rename="gene_title") } - dbg("[wgcna.describeModules] names.top = ", names(top)) - - if(is.null(modules)) modules <- names(top$genes) - if(is.null(modules)) modules <- names(top$sets) - if(is.null(experiment)) experiment <- "" + if(is.null(modules)) { + modules <- union(names(top$genes), names(top$sets)) + } - if(!is.null(top$genes)) modules <- intersect(modules, names(top$genes)) - if(!is.null(top$sets)) modules <- intersect(modules, names(top$sets)) + if(is.null(experiment)) experiment <- "" + ##if(!is.null(top$genes)) modules <- intersect(modules, names(top$genes)) + ##if(!is.null(top$sets)) modules <- intersect(modules, names(top$sets)) ##modules <- intersect(modules, names(top$pheno)) if(length(modules)==0) { - dbg("[wgcna.describeModules] warning: empty module list!") + info("[wgcna.describeModules] warning: empty module list!") return(NULL) } @@ -5296,17 +5459,23 @@ wgcna.describeModules <- function(wgcna, ntop=25, annot=NULL, multi=FALSE, desc <- list() for(m in modules) { ss=gg=pp=NULL - gg <- paste( top$genes[[m]], collapse=', ') - ss <- paste( sub(".*:","",top$sets[[m]]), collapse='; ') - if(m %in% names(top$pheno)) pp <- paste( top$pheno[[m]], collapse='; ') - + + if(!is.null(top$genes[[m]])) { + gg <- paste( top$genes[[m]], collapse=', ') + } + if(!is.null(top$sets[[m]])) { + ss <- paste( sub(".*:","",top$sets[[m]]), collapse='; ') + } + if(m %in% names(top$pheno)) { + pp <- paste( top$pheno[[m]], collapse='; ') + } d <- "" - if(!is.null(pp)) d <- paste(d, "Correlated phenotypes: ", pp, "

") + if(!is.null(pp)) d <- paste(d, "\n\nCorrelated phenotypes: ", pp, "

") if(!is.null(gg) && gg!="") { - d <- paste(d, "Key genes: ", gg, "

") + d <- paste(d, "\n\nKey genes: ", gg, "

") } if(!is.null(ss) && ss!="") { - d <- paste(d, "Top enriched gene sets: ", ss, "

") + d <- paste(d, "\n\nTop enriched gene sets: ", ss, "

") } desc[[m]] <- d } @@ -5319,7 +5488,7 @@ wgcna.describeModules <- function(wgcna, ntop=25, annot=NULL, multi=FALSE, return(res) } - prompt <- paste("Give a",docstyle,"of the main overall biological function of the following top enriched genesets belonging to module . Discuss the possible relationship with phenotypes of this experiment about \"\". Use maximum",numpar,"paragraphs. Do not use any bullet points. \n\nHere is list of enriched gene sets: \n") + prompt <- paste("Give a",docstyle,"of the main overall biological function of the following top enriched genesets belonging to module . Discuss the possible relationship with phenotypes of this experiment about \"\". Use maximum",numpar,"paragraphs. Use prose, do not use any bullet points or tables. \n\nHere is list of enriched gene sets: \n") if(verbose>1) cat(prompt) @@ -5369,9 +5538,44 @@ wgcna.describeModules <- function(wgcna, ntop=25, annot=NULL, multi=FALSE, return(res) } +#' @export +wgcna.getTopModules <- function(wgcna, topratio=0.85, kx=4, rm.grey=TRUE, + multi=FALSE) { + + if(!multi) { + ww <- list(gx = wgcna) ## single-omics wgcna object + } else { + ww <- wgcna + } + + M <- list() + i=1 + for(i in 1:length(ww)) { + me <- ww[[i]]$net$MEs + dt <- ww[[i]]$datTraits + M[[i]] <- cor(me, dt, use="pairwise") + } + + top.modules <- c() + i=1 + for(i in 1:length(M)) { + mx <- rowMeans(abs(M[[i]]**kx),na.rm=TRUE)**(1/kx) + tt <- names(which( mx > topratio * max(mx))) + top.modules <- c(top.modules, tt) + } + + if(rm.grey) { + sel.grey <- grepl("[A-Z]{2}grey$",top.modules) + top.modules <- top.modules[!sel.grey] + } + top.modules +} + #' @export wgcna.create_report <- function(wgcna, ai_model, annot=NULL, multi=FALSE, - format="markdown", ntop=100, verbose=1) { + ntop=100, topratio=0.85, psig=0.05, + format="markdown", verbose=1, + progress=NULL) { if(length(ai_model)==1) ai_model <- rep(ai_model,3) if(!multi) { @@ -5381,27 +5585,27 @@ wgcna.create_report <- function(wgcna, ai_model, annot=NULL, multi=FALSE, } ## get top modules (most correlated with some phenotype) - M <- lapply(wgcnalist, function(w) as.matrix(w$modTraits)) - top.modules <- c() - for(i in 1:length(M)) { - mx <- sqrt(rowMeans(M[[i]]**2)) - tt <- names(which( mx > 0.8 * max(mx))) - top.modules <- c(top.modules, tt) - } + top.modules <- wgcna.getTopModules(wgcnalist, topratio=topratio, kx=4, + multi=TRUE) ## always multi format top.modules - - if(is.null(annot) && !is.null(wgcna$annot)) { - annot <- wgcna$annot + + if(is.null(annot) && !is.null(wgcnalist[[1]]$annot)) { + annot <- wgcnalist[[1]]$annot + } + if(is.null(annot)) { + message("[wgcna.create_report] WARNING. providing user annot table is recommended.") } - ## Describe modules with LLM. We can use one LLM model or more. - message("Extracting top modules...") + ## Step 1. Describe modules with LLM. We can use one LLM model or more. + if(!is.null(progress)) progress$set(message = "Extracting top modules...", value=0.2) + if(verbose) message("Extracting top modules...") out <- wgcna.describeModules( wgcnalist, modules = top.modules, - multi = TRUE, ## always true - ntop = ntop, - annot = annot, + multi = TRUE, ## always true (we use list) + ntop = ntop, ## number of top genes or sets + annot = annot, + psig = psig, experiment = wgcna$experiment, verbose = verbose, model = ai_model[[1]] @@ -5410,50 +5614,127 @@ wgcna.create_report <- function(wgcna, ai_model, annot=NULL, multi=FALSE, descriptions_prompts <- out$questions descriptions <- out$answers - ## Make consensus conclusion from the description summaries. + ## Step 2: Make consensus conclusion from the description summaries. summaries <- list() summaries_prompts <- list() - k=1 - for(k in names(descriptions)) { - message("Condensating module ",k,"...") - ss <- descriptions[[k]] - qq <- paste("Following are descriptions of a certain WGCNA module by one or more LLMs. Create a concise consensus conclusion out of the independent descriptions. Just answer, no confirmation, in one paragraph. \n\n", ss) - cc <- ai.ask(qq, model=ai_model[[2]]) - summaries[[k]] <- cc - summaries_prompts[[k]] <- qq - } - - ## Make detailed report. We concatenate all summaries and ask a - ## (better) LLM model to create a report. - message("Baking full report...") - all.summaries <- lapply(names(summaries), function(me) - paste0("================= ",me," =================\n\n", summaries[[me]],"\n")) - all.summaries <- paste(all.summaries, collapse="\n\n") - if(multi) { - qq <- "These are the results of a WGCNA multi-omics analysis. There are descriptions of the most relevant modules. Create a detailed report for this entire experiment. Give conclusions about the underlying biology by connecting multi-omics modules functionally and referring to key genes, proteins or metabolites. Suggest similarity to known diseases and possible therapies.\n\n" + results <- NULL + if(ai_model[[2]] != "") { + if(!is.null(progress)) progress$set(message = "Simmering modules...", value=0.3) + if(verbose) message("Simmering modules...") + k=1 + for(k in names(descriptions)) { + ss <- descriptions[[k]] + q2 <- paste("Following are descriptions of a certain WGCNA module by one or more LLMs. Create a consensus conclusion out of the independent descriptions. Describe the underlying biology, relate correlated phenotypes and mention key genes, proteins or metabolites. Just answer, no confirmation, use 1-2 paragraphs. Use prose as much as possible, do not use tables or bullet points.\n\n", ss) + cc <- ai.ask(q2, model=ai_model[[2]]) + summaries[[k]] <- cc + summaries_prompts[[k]] <- q2 + } + results <- summaries } else { - qq <- "These are the results of a WGCNA analysis. There are descriptions of the most relevant modules. Create a detailed report for this experiment. Give conclusion about the underlying biology by connecting modules functionally and referring to key genes, proteins or metabolites. Suggest similarity to known diseases and possible therapies.\n\n" + if(verbose) message("Skipping module summaries...") + results <- descriptions + } + all.results <- lapply(names(results), function(me) + paste0("================= ",me," =================\n\n", results[[me]],"\n")) + all.results <- paste(all.results, collapse="\n\n") + + + ## Step 3: Make detailed report. We concatenate all summaries and + ## ask a (better) LLM model to create a report. + if(!is.null(progress)) progress$set(message = "Baking full report...", value=0.6) + if(verbose) message("Baking full report...") + + q3 <- "These are the results of a WGCNA analysis. There are descriptions of the most relevant modules. Create a detailed report for this experiment. Give a detailed interpretation of the underlying biology by connecting modules into biological functional programs, referring to key genes, proteins or metabolites. Build an cross-module integrative narrative. Suggest similarity to known diseases and possible therapies. +\n\n +Format like a scientific article, use prose as much as possible, minimize the use of tables and bullet points. For long tables show at least the top 5, and at most top 10, up and down entries. Do not inject any inline code. Write a discussion and conclusion at the end of the report about the integrative biological narrative. Only write if there was evidence in the source text. Omit future directions." + + if(multi) { + q3 <- sub("WGCNA","multiomics WGCNA",q3) + q3 <- paste(q3, "Discuss advantages of using multi-omics.") } + xx <- wgcna$experiment pp <- paste("You are a biologist interpreting results from a WGCNA analysis for this experiment:", xx, ".\n\n") - qq <- paste(pp, qq, "Write in descriptive prose as much as possible. Only write if there was evidence in the source text. Omit future directions.") + q3 <- paste(pp, q3, "Only write if there was evidence in the source text. Omit future directions.") + q3 <- paste(q3, "Write in prose, no tables, no bullet points.") + if(format=="markdown") { - qq <- paste(qq, "Format response as markdown.") + q3 <- paste(q3, "Format response as markdown.") } if(tolower(format)=="html") { - qq <- paste(qq, "Format response as HTML.") + q3 <- paste(q3, "Format response as HTML.") } - qq <- paste(qq, "\n\n",all.summaries) - report <- ai.ask(qq, model = ai_model[[3]]) + q3 <- paste(q3, "\n\nnHere are the results: ",all.results,"\n") + + ## Finally ask LLM + report <- ai.ask(q3, model = ai_model[[3]]) report <- gsub("^```html|```$","",report) + + ## Step 4: Create diagram from report + if(!is.null(progress)) progress$set(message = "Mashing up diagram...", value=0.8) + if(verbose) message("Mashing up diagram...") + diagram <- wgcna.create_diagram(report, ai_model=ai_model[[3]]) list( descriptions_prompts = descriptions_prompts, descriptions = descriptions, summaries_prompts = summaries_prompts, summaries = summaries, - report_prompt = qq, - report = report + report_prompt = q3, + report = report, + diagram = diagram ) } + +#' @export +wgcna.create_diagram <- function(wgcna_report, ai_model, rankdir="LR", format="dot", + correct=TRUE, double.check=TRUE) { + + q4 <- paste0("Create a diagram connecting modules in the following WGCNA report. Annotate modules with main biological function and key features (gene, proteins or metabolites). Add phenotype nodes. Suggest cause and effect relations that explain the phenotypes. Group modules with same biological functions. Give just the code in DOT format, LR direction. Do not use any special characters, without headers or footer text. Do not use subgraphs. Do not use hexadecimal color coding. Use solid lines for positive regulation, use dashed lines for negative regulation. Color fill nodes matching the module names with light palette so we can still read well the text. Never use black for fill. Again, do not fill any nodes with black, use grey instead. Color phenotype nodes lightyellow.") + + if(grepl("mermaid",format,ignore.case=TRUE)) q4 <- sub("DOT","MERMAID",q4) + q4 <- paste(q4, "\n\n", wgcna_report, "") + aa <- ai.ask(q4, model = ai_model) + aa <- gsub("```","",aa) + + diagram <- gsub("mermaid\n|dot\n","",aa) + diagram <- gsub("&","and",diagram) + + is.dot <- grepl("dot",tolower(format)) + if(is.dot) { + if(rankdir=="TB") diagram <- sub("rankdir=LR","rankdir=TB",diagram) + if(rankdir=="LR") diagram <- sub("rankdir=TB","rankdir=LR",diagram) + } + + if(is.dot && correct) { + + ## force as digraph + diagram <- sub("^graph","digraph",diagram) + + ## crazy arrows + diagram <- gsub("-x->","->",diagram) + + ## remove anything after DOT last curly bracket + diagram <- sub("\\}\n.*","}\n",diagram) + diagram <- gsub("\\[solid\\]","[style=solid]",diagram) + diagram <- gsub("\\[dashed\\]","[style=dashed]",diagram) + + # match 3- or 6-digit hex color with replace with quoted version + diagram <- gsub("lightgreen","limegreen", diagram) ## avoid + diagram <- gsub("fillcolor=black","fillcolor=lightgrey", diagram) ## avoid + diagram <- gsub("#000000","#AAAAAA",diagram) ## no black + diagram <- gsub( + "(?