initialization

initializing the directory
qu4drupole · Nov 8, 2018 · 70bb7a2 · 70bb7a2
1 parent ff25752
commit 70bb7a2
Show file tree

Hide file tree

Showing 15 changed files with 708 additions and 0 deletions.
diff --git a/COMPACT_discretize.R b/COMPACT_discretize.R
@@ -0,0 +1,55 @@
+#Taken from "MD_COMPACT_build_array.R"
+#This is the sequence to restructure the gsets and create the discretized matrices
+
+library(limma)
+library(dplyr)
+library(GEOquery)
+
+#This will load all the functions
+#load("pre_built.RData")
+source("COMPACT_functions_v3.R")
+
+i <- 1
+Tlist = list()
+#Genes are sparsely identified in GSE33785
+for(f in list.files("Data/")){
+  load(paste("Data/",f,sep=""))
+  for(trmt in unique(gset$Treatment)){
+    #need to check if we need a 0 time or control ("ctrl") point
+    if(trmt == "ctrl"){
+      next()
+    }
+    else if(min(gset$Time[gset$Treatment == trmt]) == 0){
+      gse.tmp <- gset[,gset$Treatment == trmt]
+      Tlist[[i]] <- gse.tmp
+      #browser()
+      names(Tlist)[i] <- paste(gsub(".RData","",f),paste(trmt),sep="_")
+      i = i + 1
+    }
+    else if(any(gset$Time == 0)){
+      gse.tmp <- gset[,(gset$Treatment == trmt | gset$Treatment == "ctrl")]
+      gse.tmp$Treatment <- trmt
+      Tlist[[i]] <- gse.tmp
+      #browser()
+      names(Tlist)[i] <- paste(gsub(".RData","",f),paste(trmt),sep="_")
+      i = i + 1
+    }
+    #May need another 'else if' statement for experiments where the baseline is like 1 h...
+    else{
+      print("One or more GSEs are formatted incorrectly, or a baseline is needed")
+      break()
+    }
+  }
+}
+#Tlist <- Tlist[-c(10,11)]
+print("Finished treatment list")
+
+discreteList <- list()
+for(i in 1:length(Tlist)){
+  disc.res <- eDcollapse(Tlist[[i]])
+  disc.res <- eDcontrast(disc.res)
+  disc.res <- eDdiscretize(disc.res)
+  discreteList[[i]] <- disc.res
+}
+names(discreteList) <- names(Tlist)
+print("Finished making dicrete time")
diff --git a/COMPACT_enrichment_v3.R b/COMPACT_enrichment_v3.R
@@ -0,0 +1,116 @@
+#############################################
+# Marker enrichment analysis through combinatorial compact
+# Author: DGS
+# 
+# This all assumes that the data objects from "pre_built.RData" are in the environment 
+#
+# VERSION 3: major changes to the algorithm--importantly the master array is dropped
+#   1. COMPACT matrices will be calculated on the fly
+#   2. COMPACT time codes will be sorted differently (using Raj's method)
+#   3. Enrichment is scored as a p-value calculated from a hypergeometric distribution
+#
+# General procedure
+#   1. Read in the gene list
+#   2. Build COMPACT gene lists
+#   3. Get the gene list / COMPACT intersections
+#   4. calculate p-values
+#
+#############################################
+library(reshape2)
+library(ggplot2)
+library(gplots)
+
+#read in the gene/marker list to tested
+gene_list <- read.table(file = "HSC_test_list.txt", colClasses = "character")
+gene_list <- gene_list$V1
+
+
+#make empty results matrix
+enrichMatrix <- data.frame(matrix(0, nrow = length(Tlist), ncol = length(Tlist)), row.names = names(Tlist))
+colnames(enrichMatrix) <- names(Tlist)
+
+pairs_for_CB <- combn(length(discreteList),2)
+
+for(p in 1:ncol(pairs_for_CB)){
+  #browser()
+  ind <- pairs_for_CB[,p]
+  print(paste("working on pairings: ", ind[1], ind[2]))
+  #make 3D array of COMPACT and individual gene lists
+  patternLists <- getPatternGenes(ind[1], ind[2])
+  #make an empty 3D array to store temporary COMPACT and gene lists
+  #...getting the dimensions
+  maxA <- max(sapply(patternLists[["a"]],length, USE.NAMES = F))
+  maxB <- max(sapply(patternLists[["b"]],length, USE.NAMES = F))
+  maxGenes <- max(c(maxA,maxB))
+  COMPACTdims <- c(length(patternLists[["patterns"]]), maxGenes)
+  #...making the empty array
+  COMPACT <- array(NA, dim =  c(COMPACTdims[1], COMPACTdims[1], COMPACTdims[2]))
+  for(i in 1:nrow(COMPACT)){
+    for(j in 1:ncol(COMPACT)){
+      genes <- intersect(unique(patternLists[["a"]][[i]]), unique(patternLists[["b"]][[j]]))
+      if(length(genes) > 0){
+        COMPACT[i,j,1:length(genes)] <- genes
+      }
+    }
+  }
+  #make an empty matrix to store the Marker occurance in gene lists
+  occurance.matrix <- matrix(0, nrow = COMPACTdims[1], ncol = COMPACTdims[1])
+  for(i in 1:nrow(COMPACT)){
+    for(j in 1:ncol(COMPACT)){
+      occurance.matrix[i,j] <- sum(sapply(gene_list, function(x) sum(grepl(x, COMPACT[i,j,]))))
+    }
+  }
+  #flatten the COMPACT array
+  COMPACT.flat <- flattenCOMPACT()
+  #calculate enrichment p-value
+  pvalues <- hypergeometric.pvalue()
+  enrichMatrix[ind[1], ind[2]] <- pvalues[["m"]]
+  enrichMatrix[ind[2], ind[1]] <- pvalues[["n"]]
+}
+
+#make diagonal NAs, I'm concerned that the 0's will influence FDR adjustments
+ematrix <- as.matrix(enrichMatrix)
+ematrix[ematrix == 0] <- NA
+#get rid of negative p-values...also, why are there negative p-values??
+if(any(enrichMatrix < 0)){
+  ematrix[which(ematrix < 0)] <- NA
+}
+
+#Correcting for multiple comparisons
+#FDR by ematrix row
+enrich.adjusted.pvalue <- vector(mode="numeric")
+# for(trmt in 1:nrow(ematrix)){
+#   enrich.adjusted.pvalue <- c(enrich.adjusted.pvalue,unname(p.adjust(ematrix[trmt,], method="BH")))
+# }
+
+#FDR by whole matrix
+enrich.adjusted.pvalue <- p.adjust(ematrix, method="BH")
+enrich.adjusted.pvalue[is.na(enrich.adjusted.pvalue)] <- 1
+
+#The p-values seemed similar between by row and by whole matrix, so I'll continue with whole matrix bc it's less lines
+
+#putting the p-values in a matrix
+#this method of putting a vector into a matrix only works for square matrices, which I think we will always have
+#For FDR by row, the matrix needs to be transformed
+enrich.matrix.adjusted <- data.frame(matrix(enrich.adjusted.pvalue, nrow = length(Tlist), ncol = length(Tlist)), row.names = names(Tlist))
+colnames(enrich.matrix.adjusted) <- names(Tlist)
+
+#visualizing...
+#because there is such a huge range of values and we only care about the low values, the values above 5e-3 are forced to 5e-3
+ematrix <- as.matrix(enrich.matrix.adjusted)
+ematrix[ematrix > 5e-3] <- 5e-3
+
+heatmap.2(-log(t(ematrix)),
+          Colv = F,
+          dendrogram = "row",
+          #scale = "none",
+          trace = "none",
+          key = T,
+          cexCol = 1,
+          cexRow = 1,
+          margins = c(5,15))
+
+
+save(ematrix, file="HSC_enrichment_v3.RData")
+
+