Merge pull request #36 from PF2-pasteur-fr/development

Development
PF2-pasteur-fr · Dec 11, 2017 · 73a5514 · 73a5514
2 parents 7f01d42 + 3ed9b05
commit 73a5514
Show file tree

Hide file tree

Showing 18 changed files with 82 additions and 71 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,12 +1,12 @@
 Package: SARTools
 Type: Package
 Title: Statistical Analysis of RNA-Seq Tools
-Version: 1.5.1
-Date: 2017-08-28
+Version: 1.6.0
+Date: 2017-12-11
 Author: Marie-Agnes Dillies and Hugo Varet
 Maintainer: Hugo Varet <[email protected]>
 Depends: R (>= 3.3.0), DESeq2 (>= 1.12.0), edgeR (>= 3.12.0), xtable
-Imports: stats, utils, graphics, grDevices, knitr, rmarkdown, SummarizedExperiment, S4Vectors, limma, genefilter (>= 1.44.0)
+Imports: stats, utils, graphics, grDevices, knitr, rmarkdown (>= 1.4), SummarizedExperiment, S4Vectors, limma, genefilter (>= 1.44.0)
 Suggests: optparse
 VignetteBuilder: knitr
 Encoding: latin1

diff --git a/NEWS b/NEWS
@@ -1,3 +1,11 @@
+CHANGES IN VERSION 1.6.0
+------------------------
+	o use "percentage" instead of "proportion" in some plots
+	o added pandoc-citeproc as potential requirement for some users
+	o added rmarkdown version 1.4 requirement to have knit_root_dir parameter in render()
+	o use file.path() instead of paste() when dealing with paths to import/create files
+	o added cpmCutoff parameter to writeReport.edgeR()
+
 CHANGES IN VERSION 1.5.1
 ------------------------
 	o fixed a bug in bullet points

diff --git a/R/MDSPlot.R b/R/MDSPlot.R
@@ -14,7 +14,7 @@
 MDSPlot <- function(dge, group, n=min(500,nrow(dge$counts)), gene.selection=c("pairwise", "common"),
                     col=c("lightblue","orange","MediumVioletRed","SpringGreen"), outfile=TRUE){
   if (outfile) png(filename="figures/MDS.png", width=1800, height=1800, res=300)
-    coord <- plotMDS(dge, top=n, method="logFC", gene.selection=gene.selection[1])
+    coord <- plotMDS(dge, top=n, method="logFC", gene.selection=gene.selection[1], plot=FALSE)
     abs=range(coord$x); abs=abs(abs[2]-abs[1])/25;
     ord=range(coord$y); ord=abs(ord[2]-ord[1])/25;
     plot(coord$x,coord$y, col=col[as.integer(group)], las=1, main="Multi-Dimensional Scaling plot",

diff --git a/R/barplotNull.R b/R/barplotNull.R
@@ -15,8 +15,8 @@ barplotNull <- function(counts, group, col=c("lightblue","orange","MediumVioletR
     percentage.allNull <- (nrow(counts) - nrow(removeNull(counts)))*100/nrow(counts)
     barplot(percentage, las = 2,
             col = col[as.integer(group)],
-		    ylab = "Proportion of null counts",
-		    main = "Proportion of null counts per sample", 
+		    ylab = "Percentage of null counts",
+		    main = "Percentage of null counts per sample", 
 	  	    ylim = c(0,1.2*ifelse(max(percentage)==0,1,max(percentage))))
     abline(h = percentage.allNull, lty = 2, lwd = 2)
     legend("topright", levels(group), fill=col[1:nlevels(group)], bty="n")

diff --git a/R/checkParameters.DESeq2.r b/R/checkParameters.DESeq2.r
@@ -29,75 +29,75 @@ checkParameters.DESeq2 <- function(projectName,author,targetFile,rawDir,
                                    typeTrans,locfunc,colors){
   problem <- FALSE
   if (!is.character(projectName) | length(projectName)!=1){
-    warning("projectName must be a character vector of length 1")
+    message("projectName must be a character vector of length 1")
     problem <- TRUE
   }
   if (!is.character(author) | length(author)!=1){
-    warning("author must be a character vector of length 1")
+    message("author must be a character vector of length 1")
     problem <- TRUE
   }
   if (!is.character(targetFile) | length(targetFile)!=1 || !file.exists(targetFile)){
-    warning("targetFile must be a character vector of length 1 specifying an accessible file")
+    message("targetFile must be a character vector of length 1 specifying an accessible file")
     problem <- TRUE
   }
   if (!is.character(rawDir) | length(rawDir)!=1 || is.na(file.info(rawDir)[1,"isdir"]) | !file.info(rawDir)[1,"isdir"]){
-    warning("rawDir must be a character vector of length 1 specifying an accessible directory")
+    message("rawDir must be a character vector of length 1 specifying an accessible directory")
     problem <- TRUE
   }
   if (!is.null(featuresToRemove) && !is.character(featuresToRemove)){
-    warning("featuresToRemove must be a character vector or equal to NULL")
+    message("featuresToRemove must be a character vector or equal to NULL")
     problem <- TRUE
   }
   if (!is.character(varInt) | length(varInt)!=1){
-    warning("varInt must be a character vector of length 1")
+    message("varInt must be a character vector of length 1")
     problem <- TRUE
   }
   if (!is.character(condRef) | length(condRef)!=1){
-    warning("condRef must be a character vector of length 1")
+    message("condRef must be a character vector of length 1")
     problem <- TRUE
   }
   if (!is.null(batch) && I(!is.character(batch) | length(batch)!=1)){
-    warning("batch must be NULL or a character vector of length 1")
+    message("batch must be NULL or a character vector of length 1")
     problem <- TRUE
   }
   if (!is.character(fitType) | length(fitType)!=1 || !I(fitType %in% c("parametric","local"))){
-    warning("fitType must be equal to 'parametric' or 'local'")
+    message("fitType must be equal to 'parametric' or 'local'")
     problem <- TRUE
   }
   if (!is.logical(cooksCutoff) | length(cooksCutoff)!=1){
-    warning("cooksCutoff must be a boolean vector of length 1")
+    message("cooksCutoff must be a boolean vector of length 1")
     problem <- TRUE
   }
   if (!is.logical(independentFiltering) | length(independentFiltering)!=1){
-    warning("independentFiltering must be a boolean vector of length 1")
+    message("independentFiltering must be a boolean vector of length 1")
     problem <- TRUE
   }
   if (!is.numeric(alpha) | length(alpha)!=1 || I(alpha<=0 | alpha>=1)){
-    warning("alpha must be a numeric vector of length 1 with a value between 0 and 1")
+    message("alpha must be a numeric vector of length 1 with a value between 0 and 1")
     problem <- TRUE
   }
   if (!is.character(pAdjustMethod) | length(pAdjustMethod)!=1 || !I(pAdjustMethod %in% p.adjust.methods)){
-    warning(paste("pAdjustMethod must be a value in", paste(p.adjust.methods, collapse=", ")))
+    message(paste("pAdjustMethod must be a value in", paste(p.adjust.methods, collapse=", ")))
     problem <- TRUE
   }
   if (!is.character(typeTrans) | length(typeTrans)!=1 || !I(typeTrans %in% c("VST","rlog"))){
-    warning("typeTrans must be equal to 'VST' or 'rlog'")
+    message("typeTrans must be equal to 'VST' or 'rlog'")
     problem <- TRUE
   }
   if (!is.character(locfunc) | length(locfunc)!=1 || !I(locfunc %in% c("median","shorth"))){
-    warning("locfunc must be equal to 'median' or 'shorth'")
+    message("locfunc must be equal to 'median' or 'shorth'")
     problem <- TRUE
   } else{
     if (locfunc=="shorth" & !I("genefilter" %in% installed.packages()[,"Package"])){
-      warning("Package genefilter is needed if using locfunc='shorth'")
+      message("Package genefilter is needed if using locfunc='shorth'")
       problem <- TRUE
     }
   }
   areColors <- function(col){
     sapply(col, function(X){tryCatch(is.matrix(col2rgb(X)), error=function(e){FALSE})})
   }
   if (!is.vector(colors) || !all(areColors(colors))){
-    warning("colors must be a vector of colors")
+    message("colors must be a vector of colors")
     problem <- TRUE
   }
 

diff --git a/R/checkParameters.edgeR.r b/R/checkParameters.edgeR.r
@@ -27,62 +27,62 @@ checkParameters.edgeR <- function(projectName,author,targetFile,rawDir,
                                   normalizationMethod,colors){
   problem <- FALSE
   if (!is.character(projectName) | length(projectName)!=1){
-    warning("projectName must be a character vector of length 1")
+    message("projectName must be a character vector of length 1")
     problem <- TRUE
   }
   if (!is.character(author) | length(author)!=1){
-    warning("author must be a character vector of length 1")
+    message("author must be a character vector of length 1")
     problem <- TRUE
   }
   if (!is.character(targetFile) | length(targetFile)!=1 || !file.exists(targetFile)){
-    warning("targetFile must be a character vector of length 1 specifying an accessible file")
+    message("targetFile must be a character vector of length 1 specifying an accessible file")
     problem <- TRUE
   }
   if (!is.character(rawDir) | length(rawDir)!=1 || is.na(file.info(rawDir)[1,"isdir"]) | !file.info(rawDir)[1,"isdir"]){
-    warning("rawDir must be a character vector of length 1 specifying an accessible directory")
+    message("rawDir must be a character vector of length 1 specifying an accessible directory")
     problem <- TRUE
   }  
   if (!is.null(featuresToRemove) && !is.character(featuresToRemove)){
-    warning("featuresToRemove must be a character vector or equal to NULL")
+    message("featuresToRemove must be a character vector or equal to NULL")
     problem <- TRUE
   }
   if (!is.character(varInt) | length(varInt)!=1){
-    warning("varInt must be a character vector of length 1")
+    message("varInt must be a character vector of length 1")
     problem <- TRUE
   }
   if (!is.character(condRef) | length(condRef)!=1){
-    warning("condRef must be a character vector of length 1")
+    message("condRef must be a character vector of length 1")
     problem <- TRUE
   }
   if (!is.null(batch) && I(!is.character(batch) | length(batch)!=1)){
-    warning("batch must be NULL or a character vector of length 1")
+    message("batch must be NULL or a character vector of length 1")
     problem <- TRUE
   }
   if (!is.numeric(alpha) | length(alpha)!=1 || I(alpha<=0 | alpha>=1)){
-    warning("alpha must be a numeric vector of length 1 with a value between 0 and 1")
+    message("alpha must be a numeric vector of length 1 with a value between 0 and 1")
     problem <- TRUE
   }
   if (!is.character(pAdjustMethod) | length(pAdjustMethod)!=1 || !I(pAdjustMethod %in% p.adjust.methods)){
-    warning(paste("pAdjustMethod must be a value in", paste(p.adjust.methods, collapse=", ")))
+    message(paste("pAdjustMethod must be a value in", paste(p.adjust.methods, collapse=", ")))
     problem <- TRUE
   }
   if (!is.numeric(cpmCutoff) | length(cpmCutoff)!=1 || cpmCutoff<0){
-    warning("cpmCutoff must be a numeric vector of length 1 with a value equal to or greater than 0")
+    message("cpmCutoff must be a numeric vector of length 1 with a value equal to or greater than 0")
     problem <- TRUE
   }
   if (!is.character(normalizationMethod) | length(normalizationMethod)!=1 || !I(normalizationMethod %in% c("TMM","RLE","upperquartile"))){
-    warning("gene.selection must be equal to 'TMM', 'RLE' or 'upperquartile'")
+    message("gene.selection must be equal to 'TMM', 'RLE' or 'upperquartile'")
     problem <- TRUE
   }
   if (!is.character(gene.selection) | length(gene.selection)!=1 || !I(gene.selection %in% c("pairwise","common"))){
-    warning("gene.selection must be equal to 'pairwise' or 'common'")
+    message("gene.selection must be equal to 'pairwise' or 'common'")
     problem <- TRUE
   }
   areColors <- function(col){
     sapply(col, function(X){tryCatch(is.matrix(col2rgb(X)), error=function(e){FALSE})})
   }
   if (!is.vector(colors) || !all(areColors(colors))){
-    warning("colors must be a vector of colors")
+    message("colors must be a vector of colors")
     problem <- TRUE
   }
 

diff --git a/R/loadCountData.R b/R/loadCountData.R
@@ -16,7 +16,7 @@ loadCountData <- function(target, rawDir="raw", skip=0, featuresToRemove=c("alig
   files <- as.character(target[,2])
 
   # detect if input count files are from featureCounts or HTSeq-count
-  f1 <- read.table(paste(rawDir,files[1],sep="/"), sep="\t", quote="\"", header=FALSE, nrows=1, stringsAsFactors=FALSE)
+  f1 <- read.table(file.path(rawDir, files[1]), sep="\t", quote="\"", header=FALSE, nrows=1, stringsAsFactors=FALSE)
   if (ncol(f1) >= 6 && all(apply(f1[1,1:6], 2, is.character))){
     # counter featurecounts
     idCol <- 1
@@ -33,15 +33,15 @@ loadCountData <- function(target, rawDir="raw", skip=0, featuresToRemove=c("alig
     }
   }
 
-  rawCounts <- read.table(paste(rawDir,files[1],sep="/"), sep="\t", quote="\"", header=header, skip=skip, stringsAsFactors=FALSE)
+  rawCounts <- read.table(file.path(rawDir, files[1]), sep="\t", quote="\"", header=header, skip=skip, stringsAsFactors=FALSE)
   rawCounts <- rawCounts[,c(idCol, countsCol)]
   colnames(rawCounts) <- c("Id", labels[1])
   if (any(duplicated(rawCounts$Id))) stop("Duplicated feature names in ", files[1])
   cat("Loading files:\n")
   cat(files[1],": ",length(rawCounts[,labels[1]])," rows and ",sum(rawCounts[,labels[1]]==0)," null count(s)\n",sep="")
 
   for (i in 2:length(files)){
-    tmp <- read.table(paste(rawDir,files[i],sep="/"), sep="\t", header=header, skip=skip, stringsAsFactors=FALSE)
+    tmp <- read.table(file.path(rawDir, files[i]), sep="\t", header=header, skip=skip, stringsAsFactors=FALSE)
     tmp <- tmp[,c(idCol, countsCol)]
     colnames(tmp) <- c("Id", labels[i])
     if (any(duplicated(tmp$Id))) stop("Duplicated feature names in ", files[i])

diff --git a/R/majSequences.R b/R/majSequences.R
@@ -23,8 +23,8 @@ majSequences <- function(counts, n=3, group, col=c("lightblue","orange","MediumV
   if (outfile) png(filename="figures/majSeq.png",width=min(3600,1800+800*ncol(counts)/10),height=1800,res=300)
     maj <- apply(p, 2, max)
     seqname <- rownames(p)[apply(p, 2, which.max)]
-    x <- barplot(maj, col=col[as.integer(group)], main="Proportion of reads from most expressed sequence",
-	             ylim=c(0, max(maj)*1.2), las=2, ylab="Proportion of reads")
+    x <- barplot(maj, col=col[as.integer(group)], main="Percentage of reads from most expressed sequence",
+	             ylim=c(0, max(maj)*1.2), las=2, ylab="Percentage of reads")
     legend("topright", levels(group), fill=col[1:nlevels(group)], bty="n")
     for (i in 1:length(seqname)) text(x[i], maj[i]/2, seqname[i], cex=0.8, srt=90, adj=0)
   if (outfile) dev.off()

diff --git a/R/writeReport.DESeq2.r b/R/writeReport.DESeq2.r
@@ -40,8 +40,5 @@ writeReport.DESeq2 <- function(target, counts, out.DESeq2, summaryResults, majSe
                     run_pandoc=TRUE,
                     quiet=TRUE,
                     clean=TRUE)
-  # delete unwanted directory/file
-  # unlink("cache",force=TRUE,recursive=TRUE)
-  # unlink("report_DESeq2.md",force=TRUE)
   cat("HTML report created\n")
 }
diff --git a/R/writeReport.edgeR.r b/R/writeReport.edgeR.r
@@ -18,17 +18,18 @@
 #' @param batch variable to take as a batch effect
 #' @param alpha threshold of statistical significance
 #' @param pAdjustMethod p-value adjustment method: \code{"BH"} (default) or \code{"BY"}
+#' @param cpmCutoff counts-per-million cut-off to filter low counts
 #' @param colors vector of colors of each biological condition on the plots
 #' @param gene.selection selection of the features in \code{MDSPlot()} (\code{"pairwise"} by default)
 #' @param normalizationMethod normalization method: \code{"TMM"} (default), \code{"RLE"} (DESeq) or \code{"upperquartile"}
 #' @details This function generates the HTML report for a statistical analysis with edgeR. It uses the tables and graphs created during the workflow as well as the parameters defined at the beginning of the script.
 #' @author Hugo Varet
 
-writeReport.edgeR <- function(target,counts,out.edgeR,summaryResults,majSequences,
-                              workDir,projectName,author,targetFile,rawDir,
-                              featuresToRemove,varInt,condRef,batch,
-                              alpha,pAdjustMethod,colors,gene.selection,
-                              normalizationMethod){
+writeReport.edgeR <- function(target, counts, out.edgeR, summaryResults, majSequences,
+                              workDir, projectName, author, targetFile, rawDir,
+                              featuresToRemove, varInt, condRef, batch,
+                              alpha, pAdjustMethod, cpmCutoff, colors,
+                              gene.selection, normalizationMethod){
   rmarkdown::render(input=system.file("report_edgeR.rmd", package="SARTools"),
                     output_file=paste0(projectName, "_report.html"),
                     output_dir=workDir,
@@ -37,8 +38,5 @@ writeReport.edgeR <- function(target,counts,out.edgeR,summaryResults,majSequence
                     run_pandoc=TRUE,
                     quiet=TRUE,
                     clean=TRUE)
-  # delete unwanted directory/file
-  # unlink("cache",force=TRUE,recursive=TRUE)
-  # unlink(paste0("report_edgeR.md"),force=TRUE)
   cat("HTML report created\n")
 }
diff --git a/README.md b/README.md
@@ -15,7 +15,11 @@ In addition to the SARTools package itself, the workflow requires the installati
 To install the SARTools package from GitHub, open an R session and:
 - install DESeq2, edgeR and genefilter with `source("http://bioconductor.org/biocLite.R")` and `biocLite(c("DESeq2", "edgeR", "genefilter"))` (if not installed yet)
 - install devtools with `install.packages("devtools")` (if not installed yet)
-- Note: Ubuntu users may have to install some libraries (libxml2-dev, libcurl4-openssl-dev and libssl-dev) to be able to install DESeq2 and devtools
+- Notes:
+
+	- Ubuntu users may have to install some libraries (libxml2-dev, libcurl4-openssl-dev and libssl-dev) to be able to install DESeq2 and devtools
+	- Some users may have to install the pandoc and pandoc-citeproc libraries to be able to generate the final HTML reports
+
 - for Windows users only, install [Rtools](https://cran.r-project.org/bin/windows/Rtools/) or check that it is already installed (needed to build the package)
 - load the devtools R package with `library(devtools)`
 - run `install_github("PF2-pasteur-fr/SARTools", build_vignettes=TRUE)`
@@ -34,6 +38,8 @@ How to use SARTools?
 
 An HTML vignette is available within the vignettes folder on GitHub and provides extensive information on the use of SARTools. The user can also open it with `vignette("SARTools")` if it has been generated during the installation of the package.
 
+Be careful to use the R script associated with the version of SARTools installed on your system.
+
 Please read the NEWS file to see the latest improvements!
 
 About SARTools

diff --git a/inst/report_DESeq2.rmd b/inst/report_DESeq2.rmd
@@ -58,10 +58,10 @@ Figure 1 shows the total number of mapped and counted reads for each sample. We
 
 </center>
 
-Figure 2 shows the proportion of features with no read count in each sample. We expect this proportion to be similar within conditions. Features with null read counts in the `r ncol(counts)` samples are left in the data but are not taken into account for the analysis with DESeq2. Here, `r nbNull` features (`r round(100*nbNull/nrow(counts),2)`%) are in this situation (dashed line). Results for those features (fold-change and p-values) are set to NA in the results files.
+Figure 2 shows the percentage of features with no read count in each sample. We expect this percentage to be similar within conditions. Features with null read counts in the `r ncol(counts)` samples are left in the data but are not taken into account for the analysis with DESeq2. Here, `r nbNull` features (`r round(100*nbNull/nrow(counts),2)`%) are in this situation (dashed line). Results for those features (fold-change and p-values) are set to NA in the results files.
 
 <center>
-![Figure 2: Proportion of features with null read counts in each sample.](figures/barplotNull.png){width=600}
+![Figure 2: Percentage of features with null read counts in each sample.](figures/barplotNull.png){width=600}
 
 </center>
 

diff --git a/inst/report_edgeR.rmd b/inst/report_edgeR.rmd
@@ -60,11 +60,11 @@ Figure 1 shows the total number of mapped and counted reads for each sample. We
 </center>
 
 
-Figure 2 shows the proportion of features with no read count in each sample. We expect this proportion to be similar within conditions. Features with null read counts in the `r ncol(counts)` samples will not be taken into account for the analysis with edgeR. Here, `r nbNull` features (`r round(100*percentNull,2)`%) are in this situation (dashed line).
+Figure 2 shows the percentage of features with no read count in each sample. We expect this percentage to be similar within conditions. Features with null read counts in the `r ncol(counts)` samples will not be taken into account for the analysis with edgeR. Here, `r nbNull` features (`r round(100*percentNull,2)`%) are in this situation (dashed line).
 
 
 <center>
-![Figure 2: Proportion of features with null read counts in each sample.](figures/barplotNull.png){width=600}
+![Figure 2: Percentage of features with null read counts in each sample.](figures/barplotNull.png){width=600}
 
 </center>