diff --git a/DESCRIPTION b/DESCRIPTION index 277186d..8a7d82b 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,12 @@ Package: SARTools Type: Package Title: Statistical Analysis of RNA-Seq Tools -Version: 1.4.1 -Date: 2017-05-02 +Version: 1.5.0 +Date: 2017-08-10 Author: Marie-Agnes Dillies and Hugo Varet Maintainer: Hugo Varet Depends: R (>= 3.3.0), DESeq2 (>= 1.12.0), edgeR (>= 3.12.0), xtable -Imports: stats, utils, graphics, grDevices, knitr, SummarizedExperiment, S4Vectors, limma, genefilter (>= 1.44.0) +Imports: stats, utils, graphics, grDevices, knitr, rmarkdown, SummarizedExperiment, S4Vectors, limma, genefilter (>= 1.44.0) Suggests: optparse VignetteBuilder: knitr Encoding: latin1 diff --git a/NAMESPACE b/NAMESPACE index 4b6c045..a07303f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,8 +24,8 @@ importFrom(graphics,par) importFrom(graphics,plot) importFrom(graphics,points) importFrom(graphics,text) -importFrom(knitr,knit2html) importFrom(limma,plotMDS) +importFrom(rmarkdown,render) importFrom(stats,density) importFrom(stats,dist) importFrom(stats,dnorm) diff --git a/NEWS b/NEWS index 0cbef0a..2676cd8 100755 --- a/NEWS +++ b/NEWS @@ -1,3 +1,9 @@ +CHANGES IN VERSION 1.5.0 +------------------------ + o new HTML report (dynamic table of contents, automatic bibliography) + o simplified SERE calculations + o use of warning() instead of print() in checkParameters.edgeR() and checkParameters.DESeq2() + CHANGES IN VERSION 1.4.1 ------------------------ o SARTools now accepts count files generated by featureCounts (still one count file per sample) diff --git a/R/NAMESPACE.R b/R/NAMESPACE.R index cea1053..b5ab23b 100755 --- a/R/NAMESPACE.R +++ b/R/NAMESPACE.R @@ -6,7 +6,7 @@ #' @importFrom graphics abline barplot boxplot curve hist legend lines pairs par plot points text #' @importFrom stats density dist dnorm formula hclust lm model.matrix p.adjust.methods prcomp quantile relevel sd var #' @importFrom utils combn head installed.packages packageVersion read.table tail write.table -#' @importFrom knitr knit2html +#' @importFrom rmarkdown render #' @importFrom SummarizedExperiment assay colData #' @importFrom S4Vectors mcols metadata #' @importFrom limma plotMDS diff --git a/R/checkParameters.DESeq2.r b/R/checkParameters.DESeq2.r index 50e8ae3..e049c1d 100755 --- a/R/checkParameters.DESeq2.r +++ b/R/checkParameters.DESeq2.r @@ -25,80 +25,80 @@ checkParameters.DESeq2 <- function(projectName,author,targetFile,rawDir, featuresToRemove,varInt,condRef,batch,fitType, - cooksCutoff,independentFiltering,alpha,pAdjustMethod, - typeTrans,locfunc,colors){ + cooksCutoff,independentFiltering,alpha,pAdjustMethod, + typeTrans,locfunc,colors){ problem <- FALSE if (!is.character(projectName) | length(projectName)!=1){ - print("projectName must be a character vector of length 1") - problem <- TRUE + warning("projectName must be a character vector of length 1") + problem <- TRUE } if (!is.character(author) | length(author)!=1){ - print("author must be a character vector of length 1") - problem <- TRUE + warning("author must be a character vector of length 1") + problem <- TRUE } if (!is.character(targetFile) | length(targetFile)!=1 || !file.exists(targetFile)){ - print("targetFile must be a character vector of length 1 specifying an accessible file") - problem <- TRUE + warning("targetFile must be a character vector of length 1 specifying an accessible file") + problem <- TRUE } if (!is.character(rawDir) | length(rawDir)!=1 || is.na(file.info(rawDir)[1,"isdir"]) | !file.info(rawDir)[1,"isdir"]){ - print("rawDir must be a character vector of length 1 specifying an accessible directory") - problem <- TRUE + warning("rawDir must be a character vector of length 1 specifying an accessible directory") + problem <- TRUE } if (!is.null(featuresToRemove) && !is.character(featuresToRemove)){ - print("featuresToRemove must be a character vector or equal to NULL") - problem <- TRUE + warning("featuresToRemove must be a character vector or equal to NULL") + problem <- TRUE } if (!is.character(varInt) | length(varInt)!=1){ - print("varInt must be a character vector of length 1") - problem <- TRUE + warning("varInt must be a character vector of length 1") + problem <- TRUE } if (!is.character(condRef) | length(condRef)!=1){ - print("condRef must be a character vector of length 1") - problem <- TRUE + warning("condRef must be a character vector of length 1") + problem <- TRUE } if (!is.null(batch) && I(!is.character(batch) | length(batch)!=1)){ - print("batch must be NULL or a character vector of length 1") - problem <- TRUE + warning("batch must be NULL or a character vector of length 1") + problem <- TRUE } if (!is.character(fitType) | length(fitType)!=1 || !I(fitType %in% c("parametric","local"))){ - print("fitType must be equal to 'parametric' or 'local'") - problem <- TRUE + warning("fitType must be equal to 'parametric' or 'local'") + problem <- TRUE } if (!is.logical(cooksCutoff) | length(cooksCutoff)!=1){ - print("cooksCutoff must be a boolean vector of length 1") - problem <- TRUE + warning("cooksCutoff must be a boolean vector of length 1") + problem <- TRUE } if (!is.logical(independentFiltering) | length(independentFiltering)!=1){ - print("independentFiltering must be a boolean vector of length 1") - problem <- TRUE + warning("independentFiltering must be a boolean vector of length 1") + problem <- TRUE } if (!is.numeric(alpha) | length(alpha)!=1 || I(alpha<=0 | alpha>=1)){ - print("alpha must be a numeric vector of length 1 with a value between 0 and 1") - problem <- TRUE + warning("alpha must be a numeric vector of length 1 with a value between 0 and 1") + problem <- TRUE } if (!is.character(pAdjustMethod) | length(pAdjustMethod)!=1 || !I(pAdjustMethod %in% p.adjust.methods)){ - print(paste("pAdjustMethod must be a value in", paste(p.adjust.methods, collapse=", "))) - problem <- TRUE + warning(paste("pAdjustMethod must be a value in", paste(p.adjust.methods, collapse=", "))) + problem <- TRUE } if (!is.character(typeTrans) | length(typeTrans)!=1 || !I(typeTrans %in% c("VST","rlog"))){ - print("typeTrans must be equal to 'VST' or 'rlog'") - problem <- TRUE + warning("typeTrans must be equal to 'VST' or 'rlog'") + problem <- TRUE } if (!is.character(locfunc) | length(locfunc)!=1 || !I(locfunc %in% c("median","shorth"))){ - print("locfunc must be equal to 'median' or 'shorth'") - problem <- TRUE + warning("locfunc must be equal to 'median' or 'shorth'") + problem <- TRUE } else{ if (locfunc=="shorth" & !I("genefilter" %in% installed.packages()[,"Package"])){ - print("Package genefilter is needed if using locfunc='shorth'") - problem <- TRUE - } + warning("Package genefilter is needed if using locfunc='shorth'") + problem <- TRUE + } } areColors <- function(col){ sapply(col, function(X){tryCatch(is.matrix(col2rgb(X)), error=function(e){FALSE})}) } if (!is.vector(colors) || !all(areColors(colors))){ - print("colors must be a vector of colors") - problem <- TRUE + warning("colors must be a vector of colors") + problem <- TRUE } if (!problem){ diff --git a/R/checkParameters.edgeR.r b/R/checkParameters.edgeR.r index 4edd1d5..cbe10c6 100755 --- a/R/checkParameters.edgeR.r +++ b/R/checkParameters.edgeR.r @@ -23,67 +23,67 @@ checkParameters.edgeR <- function(projectName,author,targetFile,rawDir, featuresToRemove,varInt,condRef,batch,alpha, - pAdjustMethod,cpmCutoff,gene.selection, - normalizationMethod,colors){ + pAdjustMethod,cpmCutoff,gene.selection, + normalizationMethod,colors){ problem <- FALSE if (!is.character(projectName) | length(projectName)!=1){ - print("projectName must be a character vector of length 1") - problem <- TRUE + warning("projectName must be a character vector of length 1") + problem <- TRUE } if (!is.character(author) | length(author)!=1){ - print("author must be a character vector of length 1") - problem <- TRUE + warning("author must be a character vector of length 1") + problem <- TRUE } if (!is.character(targetFile) | length(targetFile)!=1 || !file.exists(targetFile)){ - print("targetFile must be a character vector of length 1 specifying an accessible file") - problem <- TRUE + warning("targetFile must be a character vector of length 1 specifying an accessible file") + problem <- TRUE } if (!is.character(rawDir) | length(rawDir)!=1 || is.na(file.info(rawDir)[1,"isdir"]) | !file.info(rawDir)[1,"isdir"]){ - print("rawDir must be a character vector of length 1 specifying an accessible directory") - problem <- TRUE + warning("rawDir must be a character vector of length 1 specifying an accessible directory") + problem <- TRUE } if (!is.null(featuresToRemove) && !is.character(featuresToRemove)){ - print("featuresToRemove must be a character vector or equal to NULL") - problem <- TRUE + warning("featuresToRemove must be a character vector or equal to NULL") + problem <- TRUE } if (!is.character(varInt) | length(varInt)!=1){ - print("varInt must be a character vector of length 1") - problem <- TRUE + warning("varInt must be a character vector of length 1") + problem <- TRUE } if (!is.character(condRef) | length(condRef)!=1){ - print("condRef must be a character vector of length 1") - problem <- TRUE + warning("condRef must be a character vector of length 1") + problem <- TRUE } if (!is.null(batch) && I(!is.character(batch) | length(batch)!=1)){ - print("batch must be NULL or a character vector of length 1") - problem <- TRUE + warning("batch must be NULL or a character vector of length 1") + problem <- TRUE } if (!is.numeric(alpha) | length(alpha)!=1 || I(alpha<=0 | alpha>=1)){ - print("alpha must be a numeric vector of length 1 with a value between 0 and 1") - problem <- TRUE + warning("alpha must be a numeric vector of length 1 with a value between 0 and 1") + problem <- TRUE } if (!is.character(pAdjustMethod) | length(pAdjustMethod)!=1 || !I(pAdjustMethod %in% p.adjust.methods)){ - print(paste("pAdjustMethod must be a value in", paste(p.adjust.methods, collapse=", "))) - problem <- TRUE + warning(paste("pAdjustMethod must be a value in", paste(p.adjust.methods, collapse=", "))) + problem <- TRUE } if (!is.numeric(cpmCutoff) | length(cpmCutoff)!=1 || cpmCutoff<0){ - print("cpmCutoff must be a numeric vector of length 1 with a value equal to or greater than 0") - problem <- TRUE + warning("cpmCutoff must be a numeric vector of length 1 with a value equal to or greater than 0") + problem <- TRUE } if (!is.character(normalizationMethod) | length(normalizationMethod)!=1 || !I(normalizationMethod %in% c("TMM","RLE","upperquartile"))){ - print("gene.selection must be equal to 'TMM', 'RLE' or 'upperquartile'") - problem <- TRUE + warning("gene.selection must be equal to 'TMM', 'RLE' or 'upperquartile'") + problem <- TRUE } if (!is.character(gene.selection) | length(gene.selection)!=1 || !I(gene.selection %in% c("pairwise","common"))){ - print("gene.selection must be equal to 'pairwise' or 'common'") - problem <- TRUE + warning("gene.selection must be equal to 'pairwise' or 'common'") + problem <- TRUE } areColors <- function(col){ sapply(col, function(X){tryCatch(is.matrix(col2rgb(X)), error=function(e){FALSE})}) } if (!is.vector(colors) || !all(areColors(colors))){ - print("colors must be a vector of colors") - problem <- TRUE + warning("colors must be a vector of colors") + problem <- TRUE } if (!problem){ diff --git a/R/descriptionPlots.r b/R/descriptionPlots.r index ec97bc2..017900b 100755 --- a/R/descriptionPlots.r +++ b/R/descriptionPlots.r @@ -27,11 +27,7 @@ descriptionPlots <- function(counts, group, col=c("lightblue","orange","MediumVi # SERE and pairwise scatter plots cat("Matrix of SERE statistics:\n") print(tabSERE(counts)) - if (ncol(counts)<=30){ - pairwiseScatterPlots(counts=counts, group=group) - } else{ - warning("No pairwise scatter-plot produced because of a too high number of samples (>30).") - } + pairwiseScatterPlots(counts=counts, group=group) return(majSequences) } diff --git a/R/pairwiseScatterPlots.R b/R/pairwiseScatterPlots.R index 825550e..bab5c93 100755 --- a/R/pairwiseScatterPlots.R +++ b/R/pairwiseScatterPlots.R @@ -10,17 +10,26 @@ pairwiseScatterPlots <- function(counts, group, outfile=TRUE){ ncol <- ncol(counts) - if (outfile) png(filename="figures/pairwiseScatter.png",width=700*ncol,height=700*ncol,res=300) + if (ncol <= 30){ + if (outfile) png(filename="figures/pairwiseScatter.png", width=700*ncol, height=700*ncol, res=300) # defining panel and lower.panel functions - panel <- function(x,y,...){points(x, y, pch=".");abline(a=0,b=1,lty=2);} - lower.panel <- function(x,y,...){ - horizontal <- (par("usr")[1] + par("usr")[2]) / 2; + panel <- function(x,y,...){points(x, y, pch=".");abline(a=0,b=1,lty=2);} + lower.panel <- function(x,y,...){ + horizontal <- (par("usr")[1] + par("usr")[2]) / 2; vertical <- (par("usr")[3] + par("usr")[4]) / 2; - text(horizontal, vertical, round(SERE(2^cbind(x,y) - 1), digits=2), cex=ncol/2.5) - } - # use of the paris function + text(horizontal, vertical, round(SERE(2^cbind(x,y) - 1), digits=2), cex=ncol/2.5) + } + # use of the paris function pairs(log2(counts+1), panel=panel, lower.panel=lower.panel, las=1, labels=paste(colnames(counts),group,sep="\n"), main="Pairwise scatter plot",cex.labels=ncol/2,cex.main=ncol/4) - if (outfile) dev.off() + if (outfile) dev.off() + } else{ + warning("No pairwise scatter-plot produced because of a too high number of samples (>30).") + if (outfile) png(filename="figures/pairwiseScatter.png", width=1900, height=1900, res=300) + par(mar=c(1.5, 1.5, 1.5, 1.5)) + plot(0, 0, bty="o", pch=".", col="white", xaxt="n", yaxt="n", xlab="", ylab="") + text(0, 0.3, "No pairwise scatter-plot produced because of\na too high number of samples (>30).", pos=1, cex=1.5) + if (outfile) dev.off() + } } diff --git a/R/tabSERE.R b/R/tabSERE.R index 0c846df..ea1ac72 100755 --- a/R/tabSERE.R +++ b/R/tabSERE.R @@ -7,10 +7,10 @@ #' @author Marie-Agnes Dillies and Hugo Varet tabSERE <- function(counts){ - sere <- matrix(NA, ncol=ncol(counts), nrow=ncol(counts)) - for (i in 1:ncol(counts)){ - for (j in 1:ncol(counts)){ - sere[i,j] <- SERE(counts[,c(i,j)]) + sere <- matrix(0, ncol=ncol(counts), nrow=ncol(counts)) + for (i in 1:(ncol(counts)-1)){ + for (j in (i+1):ncol(counts)){ + sere[i,j] <- sere[j,i] <- SERE(counts[,c(i,j)]) } } colnames(sere) <- rownames(sere) <- colnames(counts) diff --git a/R/writeReport.DESeq2.r b/R/writeReport.DESeq2.r index a04e9c3..737b0c2 100755 --- a/R/writeReport.DESeq2.r +++ b/R/writeReport.DESeq2.r @@ -29,14 +29,19 @@ writeReport.DESeq2 <- function(target, counts, out.DESeq2, summaryResults, majSequences, workDir, projectName, author, targetFile, rawDir, - featuresToRemove, varInt, condRef, batch, fitType, - cooksCutoff, independentFiltering, alpha, pAdjustMethod, - typeTrans, locfunc, colors){ - knit2html(input=system.file("report_DESeq2.rmd", package="SARTools"), - output=paste0(projectName, "_report.html"), - quiet=TRUE, title="Statistical report") + featuresToRemove, varInt, condRef, batch, fitType, + cooksCutoff, independentFiltering, alpha, pAdjustMethod, + typeTrans, locfunc, colors){ + rmarkdown::render(input=system.file("report_DESeq2.rmd", package="SARTools"), + output_file=paste0(projectName, "_report.html"), + output_dir=workDir, + intermediates_dir=workDir, + knit_root_dir=workDir, + run_pandoc=TRUE, + quiet=TRUE, + clean=TRUE) # delete unwanted directory/file - unlink("cache",force=TRUE,recursive=TRUE) - unlink("report_DESeq2.md",force=TRUE) + # unlink("cache",force=TRUE,recursive=TRUE) + # unlink("report_DESeq2.md",force=TRUE) cat("HTML report created\n") } diff --git a/R/writeReport.edgeR.r b/R/writeReport.edgeR.r index 5655c85..fac76d0 100755 --- a/R/writeReport.edgeR.r +++ b/R/writeReport.edgeR.r @@ -26,14 +26,19 @@ writeReport.edgeR <- function(target,counts,out.edgeR,summaryResults,majSequences, workDir,projectName,author,targetFile,rawDir, - featuresToRemove,varInt,condRef,batch, - alpha,pAdjustMethod,colors,gene.selection, - normalizationMethod){ - knit2html(input=system.file("report_edgeR.rmd", package="SARTools"), - output=paste0(projectName, "_report.html"), - quiet=TRUE, title="Statistical report") + featuresToRemove,varInt,condRef,batch, + alpha,pAdjustMethod,colors,gene.selection, + normalizationMethod){ + rmarkdown::render(input=system.file("report_edgeR.rmd", package="SARTools"), + output_file=paste0(projectName, "_report.html"), + output_dir=workDir, + intermediates_dir=workDir, + knit_root_dir=workDir, + run_pandoc=TRUE, + quiet=TRUE, + clean=TRUE) # delete unwanted directory/file - unlink("cache",force=TRUE,recursive=TRUE) - unlink(paste0("report_edgeR.md"),force=TRUE) + # unlink("cache",force=TRUE,recursive=TRUE) + # unlink(paste0("report_edgeR.md"),force=TRUE) cat("HTML report created\n") } diff --git a/inst/bibliography.bib b/inst/bibliography.bib new file mode 100755 index 0000000..39b29ff --- /dev/null +++ b/inst/bibliography.bib @@ -0,0 +1,218 @@ +@Manual{rcoreteam, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2017}, + url = {https://www.R-project.org/}, + } + +@Article{Schulze2012, +author="Schulze, Stefan K. +and Kanwar, Rahul +and G{\"o}lzenleuchter, Meike +and Therneau, Terry M. +and Beutler, Andreas S.", +title="SERE: Single-parameter quality control and sample comparison for RNA-Seq", +journal="BMC Genomics", +year="2012", +volume="13", +number="1", +pages="524", +issn="1471-2164", +doi="10.1186/1471-2164-13-524", +url="http://dx.doi.org/10.1186/1471-2164-13-524" +} + +@Article{Gentleman2004, +author="Gentleman, Robert C. +and Carey, Vincent J. +and Bates, Douglas M. +and Bolstad, Ben +and Dettling, Marcel +and Dudoit, Sandrine +and Ellis, Byron +and Gautier, Laurent +and Ge, Yongchao +and Gentry, Jeff +and Hornik, Kurt +and Hothorn, Torsten +and Huber, Wolfgang +and Iacus, Stefano +and Irizarry, Rafael +and Leisch, Friedrich +and Li, Cheng +and Maechler, Martin +and Rossini, Anthony J. +and Sawitzki, Gunther +and Smith, Colin +and Smyth, Gordon +and Tierney, Luke +and Yang, Jean YH +and Zhang, Jianhua", +title="Bioconductor: open software development for computational biology and bioinformatics", +journal="Genome Biology", +year="2004", +volume="5", +number="10", +pages="R80", +issn="1474-760X", +doi="10.1186/gb-2004-5-10-r80", +url="http://dx.doi.org/10.1186/gb-2004-5-10-r80" +} + +@Article{Anders2010, +author="Anders, Simon and Huber, Wolfgang", +title="Differential expression analysis for sequence count data", +journal="Genome Biology", +year="2010", +volume="11", +number="10", +pages="R106", +issn="1474-760X", +doi="10.1186/gb-2010-11-10-r106", +url="http://dx.doi.org/10.1186/gb-2010-11-10-r106" +} + +@Article{Love2014, +author="Love, Michael I. and Huber, Wolfgang and Anders, Simon", +title="Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2", +journal="Genome Biology", +year="2014", +volume="15", +number="12", +pages="550", +issn="1474-760X", +doi="10.1186/s13059-014-0550-8", +url="http://dx.doi.org/10.1186/s13059-014-0550-8" +} + +@article{cook1977, + ISSN = {00401706}, + URL = {http://www.jstor.org/stable/1268249}, + author = {R. Dennis Cook}, + journal = {Technometrics}, + number = {1}, + pages = {15-18}, + publisher = {Taylor & Francis, Ltd., American Statistical Association, American Society for Quality}, + title = {Detection of Influential Observation in Linear Regression}, + volume = {19}, + year = {1977} +} + +@article{cox1897, + ISSN = {00359246}, + URL = {http://www.jstor.org/stable/2345476}, + author = {D. R. Cox and N. Reid}, + journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, + number = {1}, + pages = {1-39}, + publisher = {Royal Statistical Society, Wiley}, + title = {Parameter Orthogonality and Approximate Conditional Inference}, + volume = {49}, + year = {1987} +} + +@article{BH1995, + ISSN = {00359246}, + URL = {http://www.jstor.org/stable/2346101}, + author = {Yoav Benjamini and Yosef Hochberg}, + journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, + number = {1}, + pages = {289-300}, + publisher = {Royal Statistical Society, Wiley}, + title = {Controlling the False Discovery Rate: A Practical and Powerful Approach to Multiple Testing}, + volume = {57}, + year = {1995} +} + +@article{BY2001, + ISSN = {00905364}, + URL = {http://www.jstor.org/stable/2674075}, + author = {Yoav Benjamini and Daniel Yekutieli}, + journal = {The Annals of Statistics}, + number = {4}, + pages = {1165-1188}, + publisher = {Institute of Mathematical Statistics}, + title = {The Control of the False Discovery Rate in Multiple Testing under Dependency}, + volume = {29}, + year = {2001} +} + +@article{robinson2010, +author = {Robinson, Mark and McCarthy, Davis and Smyth, Gordon}, +title = {edgeR: a Bioconductor package for differential expression analysis of digital gene expression data}, +journal = {Bioinformatics}, +volume = {26}, +number = {1}, +pages = {139}, +year = {2010}, +doi = {10.1093/bioinformatics/btp616}, +URL = {http://dx.doi.org/10.1093/bioinformatics/btp616}, +eprint = {/oup/backfile/content_public/journal/bioinformatics/26/1/10.1093/bioinformatics/btp616/2/btp616.pdf} +} + +@article{dillies2013, +author = {Dillies, Marie-Agnès and Rau, Andrea and Aubert, Julie and Hennequet-Antier, Christelle and Jeanmougin, Marine and Servant, Nicolas and Keime, Céline and Marot, Guillemette and Castel, David and Estelle, Jordi and Guernec, Gregory and Jagla, Bernd and Jouneau, Luc and Laloë, Denis and Le Gall, Caroline and Schaëffer, Brigitte and Le Crom, Stéphane and Guedj, Mickaël and Jaffrézic, Florence}, +title = {A comprehensive evaluation of normalization methods for Illumina high-throughput RNA sequencing data analysis}, +journal = {Briefings in Bioinformatics}, +volume = {14}, +number = {6}, +pages = {671}, +year = {2013}, +doi = {10.1093/bib/bbs046}, +URL = {http://dx.doi.org/10.1093/bib/bbs046}, +eprint = {/oup/backfile/content_public/journal/bib/14/6/10.1093/bib/bbs046/2/bbs046.pdf} +} + +@article{robinson2008, +author = {Robinson, Mark and Smyth, Gordon}, +title = {Small-sample estimation of negative binomial dispersion, with applications to SAGE data}, +journal = {Biostatistics}, +volume = {9}, +number = {2}, +pages = {321}, +year = {2008}, +doi = {10.1093/biostatistics/kxm030}, +URL = {http://dx.doi.org/10.1093/biostatistics/kxm030}, +eprint = {/oup/backfile/content_public/journal/biostatistics/9/2/10.1093/biostatistics/kxm030/2/kxm030.pdf} +} + +@article{robinson2007, +author = {Robinson, Mark and Smyth, Gordon}, +title = {Moderated statistical tests for assessing differences in tag abundance}, +journal = {Bioinformatics}, +volume = {23}, +number = {21}, +pages = {2881}, +year = {2007}, +doi = {10.1093/bioinformatics/btm453}, +URL = {http://dx.doi.org/10.1093/bioinformatics/btm453}, +eprint = {/oup/backfile/content_public/journal/bioinformatics/23/21/10.1093_bioinformatics_btm453/2/btm453.pdf} +} + +@article{wu2013, +author = {Wu, Hao and Wang, Chi and Wu, Zhijin}, +title = {A new shrinkage estimator for dispersion improves differential expression detection in RNA-seq data}, +journal = {Biostatistics}, +volume = {14}, +number = {2}, +pages = {232}, +year = {2013}, +doi = {10.1093/biostatistics/kxs033}, +URL = {http://dx.doi.org/10.1093/biostatistics/kxs033}, +eprint = {/oup/backfile/content_public/journal/biostatistics/14/2/10.1093/biostatistics/kxs033/2/kxs033.pdf} +} + +@article{mccarthy2012, +author = {McCarthy, Davis and Chen, Yunshun and Smyth, Gordon}, +title = {Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation}, +journal = {Nucleic Acids Research}, +volume = {40}, +number = {10}, +pages = {4288}, +year = {2012}, +doi = {10.1093/nar/gks042}, +URL = {http://dx.doi.org/10.1093/nar/gks042}, +eprint = {/oup/backfile/content_public/journal/nar/40/10/10.1093_nar_gks042/2/gks042.pdf} +} diff --git a/inst/medecine-sciences.csl b/inst/medecine-sciences.csl new file mode 100644 index 0000000..715fc1d --- /dev/null +++ b/inst/medecine-sciences.csl @@ -0,0 +1,130 @@ + + diff --git a/inst/report_DESeq2.rmd b/inst/report_DESeq2.rmd index f4e5427..72ca217 100755 --- a/inst/report_DESeq2.rmd +++ b/inst/report_DESeq2.rmd @@ -1,56 +1,43 @@ -#
Statistical report of project `r projectName`:
-#
pairwise comparison(s) of conditions
-#
with DESeq2
- --------------------------------------------------------------------------------------------------------------------------- - -Author: `r author` - -Date: `r Sys.Date()` +--- +title: '`r paste0("Statistical report of project ", projectName, ": pairwise comparison(s) of conditions with DESeq2")`' +author: '`r author`' +date: '`r Sys.Date()`' +output: + html_document: + toc: TRUE + toc_float: TRUE + number_sections: TRUE +bibliography: bibliography.bib +csl: medecine-sciences.csl +--- The SARTools R package which generated this report has been developped at PF2 - Institut Pasteur by M.-A. Dillies and H. Varet (hugo.varet@pasteur.fr). Thanks to cite H. Varet, L. Brillet-Guéguen, J.-Y. Coppee and M.-A. Dillies, _SARTools: A DESeq2- and EdgeR-Based R Pipeline for Comprehensive Differential Analysis of RNA-Seq Data_, PLoS One, 2016, doi: http://dx.doi.org/10.1371/journal.pone.0157022 when using this tool for any analysis published. --------------------------------------------------------------------------------------------------------------------------- - -## Table of contents - -1. Introduction -2. Description of raw data -3. Variability within the experiment: data exploration -4. Normalization -5. Differential analysis -6. R session information and parameters -7. Bibliography - --------------------------------------------------------------------------------------------------------------------------- - -## 1 Introduction +# Introduction The analyses reported in this document are part of the `r projectName` project. The aim is to find features that are differentially expressed between `r paste(paste(levels(target[,varInt])[-nlevels(target[,varInt])],collapse=", "),levels(target[,varInt])[nlevels(target[,varInt])],sep=" and ")`. The statistical analysis process includes data normalization, graphical exploration of raw and normalized data, test for differential expression for each feature between the conditions, raw p-value adjustment and export of lists of features having a significant differential expression between the conditions. `r ifelse(!is.null(batch),paste0("In this analysis, the ",batch, " effect will be taken into account in the statistical models."),"")` -The analysis is performed using the R software [R Core Team, 2014], Bioconductor [Gentleman, 2004] packages including DESeq2 [Anders, 2010 and Love, 2014] and the SARTools package developed at PF2 - Institut Pasteur. Normalization and differential analysis are carried out according to the DESeq2 model and package. This report comes with additional tab-delimited text files that contain lists of differentially expressed features. - -For more details about the DESeq2 methodology, please refer to its related publications [Anders, 2010 and Love, 2014]. +The analysis is performed using the R software [@rcoreteam], Bioconductor [@Gentleman2004] packages including DESeq2 [@Anders2010; @Love2014] and the SARTools package developed at PF2 - Institut Pasteur. Normalization and differential analysis are carried out according to the DESeq2 model and package. This report comes with additional tab-delimited text files that contain lists of differentially expressed features. --------------------------------------------------------------------------------------------------------------------------- +For more details about the DESeq2 methodology, please refer to its related publications [@Anders2010; @Love2014]. -## 2 Description of raw data +# Description of raw data The count data files and associated biological conditions are listed in the following table. -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} print(xtable(target,caption="Table 1: Data files and associated biological conditions."), type="html", include.rownames=FALSE, html.table.attributes = "align='center'") ``` After loading the data we first have a look at the raw data table itself. The data table contains one row per annotated feature and one column per sequenced sample. Row names of this table are feature IDs (unique identifiers). The table contains raw count values representing the number of reads that map onto the features. For this project, there are `r nrow(counts)` features in the count data table. -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} print(xtable(head(counts),caption="Table 2: Partial view of the count data table.",digits=0), type="html", html.table.attributes = "align='center'") ``` Looking at the summary of the count table provides a basic description of these raw counts (min and max values, median, etc). -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} fun_summary=function(x){ out=c(quantile(x,c(0,0.25,0.5),type=1),mean(x),quantile(x,c(0.75,1),type=1)) names(out)=c("Min.","1st Qu.","Median","Mean","3rd Qu.","Max.") @@ -61,170 +48,151 @@ nbNull <- nrow(counts) - nrow(removeNull(counts)) # needed in one of the next pa ``` Figure 1 shows the total number of mapped and counted reads for each sample. We expect total read counts to be similar within conditions, they may be different across conditions. Total counts sometimes vary widely between replicates. This may happen for several reasons, including: + - different rRNA contamination levels between samples (even between biological replicates); - slight differences between library concentrations, since they may be difficult to measure with high precision. -
+
- Barplot total counts -
Figure 1: Number of mapped reads per sample. Colors refer to the biological condition of the sample.
+![Figure 1: Number of mapped reads per sample. Colors refer to the biological condition of the sample.](figures/barplotTotal.png){width=600} +
-
Figure 2 shows the proportion of features with no read count in each sample. We expect this proportion to be similar within conditions. Features with null read counts in the `r ncol(counts)` samples are left in the data but are not taken into account for the analysis with DESeq2. Here, `r nbNull` features (`r round(100*nbNull/nrow(counts),2)`%) are in this situation (dashed line). Results for those features (fold-change and p-values) are set to NA in the results files. -
- Barplot null counts -
Figure 2: Proportion of features with null read counts in each sample.
+![Figure 2: Proportion of features with null read counts in each sample.](figures/barplotNull.png){width=600} +
-
Figure 3 shows the distribution of read counts for each sample. For sake of readability, $\text{log}_2(\text{counts}+1)$ are used instead of raw counts. Again we expect replicates to have similar distributions. In addition, this figure shows if read counts are preferably low, medium or high. This depends on the organisms as well as the biological conditions under consideration. -
- Estimated densities of raw counts -
Figure 3: Density distribution of read counts.
+![Figure 3: Density distribution of read counts.](figures/densplot.png){width=600} +
-
It may happen that one or a few features capture a high proportion of reads (up to 20% or more). This phenomenon should not influence the normalization process. The DESeq2 normalization has proved to be robust to this situation [Dillies, 2012]. Anyway, we expect these high count features to be the same across replicates. They are not necessarily the same across conditions. Figure 4 and table 4 illustrate the possible presence of such high count features in the data set. -
- Most represented sequences -
Figure 4: Percentage of reads associated with the sequence having the highest count (provided in each box on the graph) for each sample.
+![Figure 4: Percentage of reads associated with the sequence having the highest count (provided in each box on the graph) for each sample.](figures/majSeq.png){width=600} +
-
-```{r , cache=TRUE, echo=FALSE, results="asis"} -print(xtable(majSequences,caption="Table 4: Percentage of reads associated with the sequences having the highest counts."), type="html", html.table.attributes = "align='center'") + +```{r , echo=FALSE, results="asis"} +print(xtable(as.matrix(majSequences),caption="Table 4: Percentage of reads associated with the sequences having the highest counts."), type="html", html.table.attributes = "align='center'") ``` -We may wish to assess the similarity between samples across conditions. A pairwise scatter plot is produced (figure 5) to show how replicates and samples from different biological conditions are similar or different ($\text{log}_2(\text{counts}+1)$ are used instead of raw count values). Moreover, as the Pearson correlation has been shown not to be relevant to measure the similarity between replicates, the SERE statistic has been proposed as a similarity index between RNA-Seq samples [Schulze, 2012]. It measures whether the variability between samples is random Poisson variability or higher. Pairwise SERE values are printed in the lower triangle of the pairwise scatter plot. The value of the SERE statistic is: +We may wish to assess the similarity between samples across conditions. A pairwise scatter plot is produced (figure 5) to show how replicates and samples from different biological conditions are similar or different ($\text{log}_2(\text{counts}+1)$ are used instead of raw count values). Moreover, as the Pearson correlation has been shown not to be relevant to measure the similarity between replicates, the SERE statistic has been proposed as a similarity index between RNA-Seq samples [@Schulze2012]. It measures whether the variability between samples is random Poisson variability or higher. Pairwise SERE values are printed in the lower triangle of the pairwise scatter plot. The value of the SERE statistic is: + - 0 when samples are identical (no variability at all: this may happen in the case of a sample duplication); + - 1 for technical replicates (technical variability follows a Poisson distribution); + - greater than 1 for biological replicates and samples from different biological conditions (biological variability is higher than technical one, data are over-dispersed with respect to Poisson). The higher the SERE value, the lower the similarity. It is expected to be lower between biological replicates than between samples of different biological conditions. Hence, the SERE statistic can be used to detect inversions between samples. -
-
- Pairwise scatter plot (not produced when more than 30 samples) -
Figure 5: Pairwise comparison of samples.
-
-
+
+![Figure 5: Pairwise comparison of samples (not produced when more than 30 samples).](figures/pairwiseScatter.png) --------------------------------------------------------------------------------------------------------------------------- +
-## 3 Variability within the experiment: data exploration +# Variability within the experiment: data exploration -The main variability within the experiment is expected to come from biological differences between the samples. This can be checked in two ways. The first one is to perform a hierarchical clustering of the whole sample set. This is performed after a transformation of the count data which can be either a Variance Stabilizing Transformation (VST) or a regularized log transformation (rlog) [Anders, 2010 and Love, 2014]. +The main variability within the experiment is expected to come from biological differences between the samples. This can be checked in two ways. The first one is to perform a hierarchical clustering of the whole sample set. This is performed after a transformation of the count data which can be either a Variance Stabilizing Transformation (VST) or a regularized log transformation (rlog) [@Anders2010; @Love2014]. A VST is a transformation of the data that makes them homoscedastic, meaning that the variance is then independent of the mean. It is performed in two steps: (i) a mean-variance relationship is estimated from the data with the same function that is used to normalize count data and (ii) from this relationship, a transformation of the data is performed in order to get a dataset in which the variance is independent of the mean. The homoscedasticity is a prerequisite for the use of some data analysis methods, such as hierarchical clustering or Principal Component Analysis (PCA). The regularized log transformation is based on a GLM (Generalized Linear Model) on the counts and has the same goal as a VST but is more robust in the case when the size factors vary widely. Figure 6 shows the dendrogram obtained from `r typeTrans`-transformed data. An euclidean distance is computed between samples, and the dendrogram is built upon the Ward criterion. We expect this dendrogram to group replicates and separate biological conditions. -
- Clustering -
Figure 6: Sample clustering based on normalized data.
+![Figure 6: Sample clustering based on normalized data.](figures/cluster.png){width=600} +
-
Another way of visualizing the experiment variability is to look at the first principal components of the PCA, as shown on the figure 7. On this figure, the first principal component (PC1) is expected to separate samples from the different biological conditions, meaning that the biological variability is the main source of variance in the data. -
+
- Principal component analysis -
Figure 7: First two components of a Principal Component Analysis, with percentages of variance associated with each axis.
+![Figure 7: First two components of a Principal Component Analysis, with percentages of variance associated with each axis.](figures/PCA.png){width=1200} +
-
-```{r , cache=TRUE, echo=FALSE, results="asis"} + +```{r , echo=FALSE, results="asis"} if (!is.null(batch)){ cat("For the statistical analysis, we need to take into account the effect of the ",batch," parameter. Statistical models and tests will thus be adjusted on it.\n") } ``` --------------------------------------------------------------------------------------------------------------------------- - -## 4 Normalization +# Normalization Normalization aims at correcting systematic technical biases in the data, in order to make read counts comparable across samples. The normalization proposed by DESeq2 relies on the hypothesis that most features are not differentially expressed. It computes a scaling factor for each sample. Normalized read counts are obtained by dividing raw read counts by the scaling factor associated with the sample they belong to. Scaling factors around 1 mean (almost) no normalization is performed. Scaling factors lower than 1 will produce normalized counts higher than raw ones, and the other way around. Two options are available to compute scaling factors: locfunc="median" (default) or locfunc="shorth". Here, the normalization was performed with locfunc="`r locfunc`". -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} print(xtable(t(matrix(out.DESeq2$sf, dimnames=list(target$label,"Size factor"))),caption="Table 5: Normalization factors."), type="html", html.table.attributes = "align='center'") ``` The histograms (figure 8) can help to validate the choice of the normalization parameter ("median" or "shorth"). Under the hypothesis that most features are not differentially expressed, each size factor represented by a red line is expected to be close to the mode of the distribution of the counts divided by their geometric means across samples. -
- Diagnostic of size factors -
Figure 8: Diagnostic of the estimation of the size factors.
+![Figure 8: Diagnostic of the estimation of the size factors.](figures/diagSizeFactorsHist.png) +
-
The figure 9 shows that the scaling factors of DESeq2 and the total count normalization factors may not perform similarly. -
- Size factors vs total counts -
Figure 9: Plot of the estimated size factors and the total number of reads per sample.
+![Figure 9: Plot of the estimated size factors and the total number of reads per sample.](figures/diagSizeFactorsTC.png){width=600} +
-
Boxplots are often used as a qualitative measure of the quality of the normalization process, as they show how distributions are globally affected during this process. We expect normalization to stabilize distributions across samples. Figure 10 shows boxplots of raw (left) and normalized (right) data respectively. -
- Boxplots of raw and normalized counts -
Figure 10: Boxplots of raw (left) and normalized (right) read counts.
-
-
+![Figure 10: Boxplots of raw (left) and normalized (right) read counts.](figures/countsBoxplots.png){width=1200} --------------------------------------------------------------------------------------------------------------------------- + -## 5 Differential analysis +# Differential analysis -### 5.1 Modelisation +## Modelisation DESeq2 aims at fitting one linear model per feature. For this project, the design used is counts `r paste(as.character(design(out.DESeq2$dds)),collapse=" ")` and the goal is to estimate the models' coefficients which can be interpreted as $\log_2(\texttt{FC})$. These coefficients will then be tested to get p-values and adjusted p-values. -### 5.2 Outlier detection +## Outlier detection + +Model outliers are features for which at least one sample seems unrelated to the experimental or study design. For every feature and for every sample, the Cook's distance [@cook1977] reflects how the sample matches the model. A large value of the Cook's distance indicates an outlier count and p-values are not computed for the corresponding feature. `r ifelse(!cooksCutoff,"For this project, the detection of model outliers have been turned off by setting the cut-off to the infinite.","")` -Model outliers are features for which at least one sample seems unrelated to the experimental or study design. For every feature and for every sample, the Cook's distance [Cook, 1977] reflects how the sample matches the model. A large value of the Cook's distance indicates an outlier count and p-values are not computed for the corresponding feature. `r ifelse(!cooksCutoff,"For this project, the detection of model outliers have been turned off by setting the cut-off to the infinite.","")` +## Dispersions estimation -### 5.3 Dispersions estimation +The DESeq2 model assumes that the count data follow a negative binomial distribution which is a robust alternative to the Poisson law when data are over-dispersed (the variance is higher than the mean). The first step of the statistical procedure is to estimate the dispersion of the data. Its purpose is to determine the shape of the mean-variance relationship. The default is to apply a GLM (Generalized Linear Model) based method (fitType="parametric"), which can handle complex designs but may not converge in some cases. The alternative is to use fitType="local" as described in the original paper [@Anders2010]. The parameter used for this project is fitType="`r fitType`". Then, DESeq2 imposes a Cox Reid-adjusted profile likelihood maximization [@cox1897 and McCarthy, 2012] and uses the maximum _a posteriori_ (MAP) of the dispersion [Wu, 2013]. -The DESeq2 model assumes that the count data follow a negative binomial distribution which is a robust alternative to the Poisson law when data are over-dispersed (the variance is higher than the mean). The first step of the statistical procedure is to estimate the dispersion of the data. Its purpose is to determine the shape of the mean-variance relationship. The default is to apply a GLM (Generalized Linear Model) based method (fitType="parametric"), which can handle complex designs but may not converge in some cases. The alternative is to use fitType="local" as described in the original paper [Anders, 2010]. The parameter used for this project is fitType="`r fitType`". Then, DESeq2 imposes a Cox Reid-adjusted profile likelihood maximization [Cox, 1987 and McCarthy, 2012] and uses the maximum _a posteriori_ (MAP) of the dispersion [Wu, 2013]. -
- Dispersions estimations -
Figure 11: Dispersion estimates (left) and diagnostic of log-normality (right).
+![Figure 11: Dispersion estimates (left) and diagnostic of log-normality (right).](figures/dispersionsPlot.png){width=1200} +
-
The left panel on figure 11 shows the result of the dispersion estimation step. The x- and y-axes represent the mean count value and the estimated dispersion respectively. Black dots represent empirical dispersion estimates for each feature (from the observed counts). The red dots show the mean-variance relationship function (fitted dispersion value) as estimated by the model. The blue dots are the final estimates from the maximum _a posteriori_ and are used to perform the statistical test. Blue circles (if any) point out dispersion outliers. These are features with a very high empirical variance (computed from observed counts). These high dispersion values fall far from the model estimation. For these features, the statistical test is based on the empirical variance in order to be more conservative than with the MAP dispersion. These features will have low chance to be declared significant. The figure on the right panel allows to check the hypothesis of log-normality of the dispersions. -### 5.4 Statistical test for differential expression +## Statistical test for differential expression Once the dispersion estimation and the model fitting have been done, DESeq2 can perform the statistical testing. Figure 12 shows the distributions of raw p-values computed by the statistical test for the comparison(s) done. This distribution is expected to be a mixture of a uniform distribution on $[0,1]$ and a peak around 0 corresponding to the differentially expressed features. -
+
- Histogram(s) of raw p-values -
Figure 12: Distribution(s) of raw p-values.
+![Figure 12: Distribution(s) of raw p-values.](figures/rawpHist.png) +
-
-### 5.5 Independent filtering -DESeq2 can perform an independent filtering to increase the detection power of differentially expressed features at the same experiment-wide type I error. Since features with very low counts are not likely to see significant differences typically due to high dispersion, it defines a threshold on the mean of the normalized counts irrespective of the biological condition. This procedure is independent because the information about the variables in the design formula is not used [Love, 2014]. +## Independent filtering + +DESeq2 can perform an independent filtering to increase the detection power of differentially expressed features at the same experiment-wide type I error. Since features with very low counts are not likely to see significant differences typically due to high dispersion, it defines a threshold on the mean of the normalized counts irrespective of the biological condition. This procedure is independent because the information about the variables in the design formula is not used [@Love2014]. -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} if (independentFiltering){ cat("Table 6 reports the thresholds used for each comparison and the number of features discarded by the independent filtering. Adjusted p-values of discarded features are then set to NA.\n") print(xtable(summaryResults$tabIndepFiltering,caption="Table 6: Number of features discarded by the independent filtering for each comparison."),type="html",include.rownames=FALSE, html.table.attributes = "align='center'") @@ -233,38 +201,40 @@ if (independentFiltering){ } ``` -### 5.6 Final results +## Final results -A p-value adjustment is performed to take into account multiple testing and control the false positive rate to a chosen level $\alpha$. For this analysis, a `r pAdjustMethod` p-value adjustment was performed [Benjamini, 1995 and 2001] and the level of controlled false positive rate was set to `r alpha`. +A p-value adjustment is performed to take into account multiple testing and control the false positive rate to a chosen level $\alpha$. For this analysis, a `r pAdjustMethod` p-value adjustment was performed [@BH1995 and BY2001] and the level of controlled false positive rate was set to `r alpha`. -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} print(xtable(summaryResults$nDiffTotal,caption=paste0(ifelse(independentFiltering,"Table 7: ","Table 6: "),"Number of up-, down- and total number of differentially expressed features for each comparison.")),type="html",include.rownames=FALSE, html.table.attributes = "align='center'") ``` Figure 13 represents the MA-plot of the data for the comparisons done, where differentially expressed features are highlighted in red. A MA-plot represents the log ratio of differential expression as a function of the mean intensity for each feature. Triangles correspond to features having a too low/high $\log_2(\text{FC})$ to be displayed on the plot. -
+
- MA-plot(s) -
Figure 13: MA-plot(s) of each comparison. Red dots represent significantly differentially expressed features.
+![Figure 13: MA-plot(s) of each comparison. Red dots represent significantly differentially expressed features.](figures/MAPlot.png) +
-
+ Figure 14 shows the volcano plots for the comparisons performed and differentially expressed features are still highlighted in red. A volcano plot represents the log of the adjusted P value as a function of the log ratio of differential expression. -
+
- Volcano plot(s) -
Figure 14: Volcano plot(s) of each comparison. Red dots represent significantly differentially expressed features.
+![Figure 14: Volcano plot(s) of each comparison. Red dots represent significantly differentially expressed features.](figures/volcanoPlot.png) +
-
+ Full results as well as lists of differentially expressed features are provided in the following text files which can be easily read in a spreadsheet. For each comparison: + - TestVsRef.complete.txt contains results for all the features; - TestVsRef.up.txt contains results for significantly up-regulated features. Features are ordered from the most significant adjusted p-value to the less significant one; - TestVsRef.down.txt contains results for significantly down-regulated features. Features are ordered from the most significant adjusted p-value to the less significant one. These files contain the following columns: + - Id: unique feature identifier; - sampleName: raw counts per sample; - norm.sampleName: rounded normalized counts per sample; @@ -284,13 +254,11 @@ These files contain the following columns: - betaConv: convergence of the coefficients of the model (TRUE or FALSE); - maxCooks: maximum Cook's distance of the feature. --------------------------------------------------------------------------------------------------------------------------- - -## 6 R session information and parameters +# R session information and parameters The versions of the R software and Bioconductor packages used for this analysis are listed below. It is important to save them if one wants to re-perform the analysis in the same conditions. -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} si <- as.character(toLatex(sessionInfo())) si <- si[-c(1,length(si))] si <- gsub("(\\\\verb)|(\\|)", "", si) @@ -320,19 +288,4 @@ Parameter values used for this analysis are: - locfunc: `r locfunc` - colors: `r colors` --------------------------------------------------------------------------------------------------------------------------- - -## 7 Bibliography - -- R Core Team, **R: A Language and Environment for Statistical Computing**, _R Foundation for Statistical Computing_, 2014 -- Gentleman, Carey, Bates et al, **Bioconductor: Open software development for computational biology and bioinformatics**, _Genome Biology_, 2004 -- Anders and Huber, **Differential expression analysis for sequence count data**, _Genome Biology_, 2010 -- Love, Huber and Anders, **Moderated estimation of fold change and dispersion for RNA-Seq data with DESeq2**, _Genome Biology_, 2014 -- Dillies, Rau, Aubert et al, **A comprehensive evaluation of normalization methods for Illumina RNA-seq data analysis**, _Briefings in Bioinformatics_, 2012 -- Schulze, Kanwar, Golzenleuchter et al, **SERE: Single-parameter quality control and sample comparison for RNA-Seq**, _BMC Genomics_, 2012 -- Cook, **Detection of Influential Observation in Linear Regression**, _Technometrics_, 1977 -- Cox and Reid, **Parameter orthogonality and approximate conditional inference**, _Journal of the Royal Statistical Society_, 1987 -- McCarthy, Chen and Smyth, **Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation**, _Nucleic Acids Research_, 2012 -- Wu, Wang and Wu, **A new shrinkage estimator for dispersion improves differential expression detection in RNA-seq data**, _Biostatistics_, 2013 -- Benjamini and Hochberg, **Controlling the False Discovery Rate : A Practical and Powerful Approach to Multiple Testing**, _Journal of the Royal Statistical Society_, 1995 -- Benjamini and Yekutieli, **The control of the false discovery rate in multiple testing under dependency**, _The Annals of Statistics_, 2001 +# Bibliography diff --git a/inst/report_edgeR.rmd b/inst/report_edgeR.rmd index 04a6dbd..3a4c95b 100755 --- a/inst/report_edgeR.rmd +++ b/inst/report_edgeR.rmd @@ -1,57 +1,43 @@ -#
Statistical report of project `r projectName`:
-#
pairwise comparison(s) of conditions
-#
with edgeR
- --------------------------------------------------------------------------------------------------------------------------- - -Author: `r author` - -Date: `r Sys.Date()` +--- +title: '`r paste0("Statistical report of project ", projectName, ": pairwise comparison(s) of conditions with edgeR")`' +author: '`r author`' +date: '`r Sys.Date()`' +output: + html_document: + toc: TRUE + toc_float: TRUE + number_sections: TRUE +bibliography: bibliography.bib +csl: medecine-sciences.csl +--- The SARTools R package which generated this report has been developped at PF2 - Institut Pasteur by M.-A. Dillies and H. Varet (hugo.varet@pasteur.fr). Thanks to cite H. Varet, L. Brillet-Guéguen, J.-Y. Coppee and M.-A. Dillies, _SARTools: A DESeq2- and EdgeR-Based R Pipeline for Comprehensive Differential Analysis of RNA-Seq Data_, PLoS One, 2016, doi: http://dx.doi.org/10.1371/journal.pone.0157022 when using this tool for any analysis published. --------------------------------------------------------------------------------------------------------------------------- - -## Table of contents - -1. Introduction -2. Description of raw data -3. Filtering low counts -4. Variability within the experiment: data exploration -5. Normalization -6. Differential analysis -7. R session information and parameters -8. Bibliography - --------------------------------------------------------------------------------------------------------------------------- - -## 1 Introduction +# Introduction The analyses reported in this document are part of the `r projectName` project. The aim is to find features that are differentially expressed between `r paste(paste(levels(target[,varInt])[-nlevels(target[,varInt])],collapse=", "),levels(target[,varInt])[nlevels(target[,varInt])],sep=" and ")`. The statistical analysis process includes data normalization, graphical exploration of raw and normalized data, test for differential expression for each feature between the conditions, raw p-value adjustment and export of lists of features having a significant differential expression between the conditions. `r ifelse(!is.null(batch),paste0("In this analysis, the ",batch, " effect will be taken into account in the statistical models."),"")` -The analysis is performed using the R software [R Core Team, 2014], Bioconductor [Gentleman, 2004] packages including edgeR [Robinson, 2010] and the SARTools package developed at PF2 - Institut Pasteur. Normalization and differential analysis are carried out according to the edgeR model and package. This report comes with additional tab-delimited text files that contain lists of differentially expressed features. +The analysis is performed using the R software [@rcoreteam], Bioconductor [@Gentleman2004] packages including edgeR [@robinson2010] and the SARTools package developed at PF2 - Institut Pasteur. Normalization and differential analysis are carried out according to the edgeR model and package. This report comes with additional tab-delimited text files that contain lists of differentially expressed features. -For more details about the edgeR methodology, please refer to its related publications [Robinson, 2007, 2008, 2010 and McCarthy, 2012]. +For more details about the edgeR methodology, please refer to its related publications [@robinson2007; @robinson2008; @robinson2010; @mccarthy2012]. --------------------------------------------------------------------------------------------------------------------------- - -## 2 Description of raw data +# Description of raw data The count data files and associated biological conditions are listed in the following table. -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} print(xtable(target,caption="Table 1: Data files and associated biological conditions."), type="html", include.rownames=FALSE, html.table.attributes = "align='center'") ``` After loading the data we first have a look at the raw data table itself. The data table contains one row per annotated feature and one column per sequenced sample. Row names of this table are feature IDs (unique identifiers). The table contains raw count values representing the number of reads that map onto the features. For this project, there are `r nrow(counts)` features in the count data table. -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} print(xtable(head(counts),caption="Table 2: Partial view of the count data table.",digits=0), type="html", html.table.attributes = "align='center'") ``` Looking at the summary of the count table provides a basic description of these raw counts (min and max values, median, etc). -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} fun_summary=function(x){ out=c(quantile(x,c(0,0.25,0.5),type=1),mean(x),quantile(x,c(0.75,1),type=1)) names(out)=c("Min.","1st Qu.","Median","Mean","3rd Qu.","Max.") @@ -63,179 +49,176 @@ percentNull <- nbNull/nrow(counts) ``` Figure 1 shows the total number of mapped and counted reads for each sample. We expect total read counts to be similar within conditions, they may be different across conditions. Total counts sometimes vary widely between replicates. This may happen for several reasons, including: + - different rRNA contamination levels between samples (even between biological replicates); - slight differences between library concentrations, since they may be difficult to measure with high precision. -
+
- Barplot total counts -
Figure 1: Number of mapped reads per sample. Colors refer to the biological condition of the sample.
+![Figure 1: Number of mapped reads per sample. Colors refer to the biological condition of the sample.](figures/barplotTotal.png){width=600} +
-
+ Figure 2 shows the proportion of features with no read count in each sample. We expect this proportion to be similar within conditions. Features with null read counts in the `r ncol(counts)` samples will not be taken into account for the analysis with edgeR. Here, `r nbNull` features (`r round(100*percentNull,2)`%) are in this situation (dashed line). -
+
- Barplot null counts -
Figure 2: Proportion of features with null read counts in each sample.
+![Figure 2: Proportion of features with null read counts in each sample.](figures/barplotNull.png){width=600} +
-
+ Figure 3 shows the distribution of read counts for each sample. For sake of readability, $\text{log}_2(\text{counts}+1)$ are used instead of raw counts. Again we expect replicates to have similar distributions. In addition, this figure shows if read counts are preferably low, medium or high. This depends on the organisms as well as the biological conditions under consideration. -
+
- Estimated densities of raw counts -
Figure 3: Density distribution of read counts.
+![Figure 3: Density distribution of read counts.](figures/densplot.png){width=600} +
-
-It may happen that one or a few features capture a high proportion of reads (up to 20% or more). This phenomenon should not influence the normalization process. The edgeR normalization has proved to be robust to this situation [Dillies, 2012]. Anyway, we expect these high count features to be the same across replicates. They are not necessarily the same across conditions. Figure 4 and table 4 illustrate the possible presence of such high count features in the data set. -
+It may happen that one or a few features capture a high proportion of reads (up to 20% or more). This phenomenon should not influence the normalization process. The edgeR normalization has proved to be robust to this situation [@dillies2013]. Anyway, we expect these high count features to be the same across replicates. They are not necessarily the same across conditions. Figure 4 and table 4 illustrate the possible presence of such high count features in the data set. + +
- Most represented sequences -
Figure 4: Percentage of reads associated with the sequence having the highest count (provided in each box on the graph) for each sample.
+![Figure 4: Percentage of reads associated with the sequence having the highest count (provided in each box on the graph) for each sample.](figures/majSeq.png){width=600} +
-
-```{r , cache=TRUE, echo=FALSE, results="asis"} -print(xtable(majSequences,caption="Table 4: Percentage of reads associated with the sequences having the highest counts."), type="html", html.table.attributes = "align='center'") + +```{r , echo=FALSE, results="asis"} +print(xtable(as.matrix(majSequences),caption="Table 4: Percentage of reads associated with the sequences having the highest counts."), type="html", html.table.attributes = "align='center'") ``` -We may wish to assess the similarity between samples across conditions. A pairwise scatter plot is produced (figure 5) to show how replicates and samples from different biological conditions are similar or different ($\text{log}_2(\text{counts}+1)$ are used instead of raw count values). Moreover, as the Pearson correlation has been shown not to be relevant to measure the similarity between replicates, the SERE statistic has been proposed as a similarity index between RNA-Seq samples [Schulze, 2012]. It measures whether the variability between samples is random Poisson variability or higher. Pairwise SERE values are printed in the lower triangle of the pairwise scatter plot. The value of the SERE statistic is: +We may wish to assess the similarity between samples across conditions. A pairwise scatter plot is produced (figure 5) to show how replicates and samples from different biological conditions are similar or different ($\text{log}_2(\text{counts}+1)$ are used instead of raw count values). Moreover, as the Pearson correlation has been shown not to be relevant to measure the similarity between replicates, the SERE statistic has been proposed as a similarity index between RNA-Seq samples [@Schulze2012]. It measures whether the variability between samples is random Poisson variability or higher. Pairwise SERE values are printed in the lower triangle of the pairwise scatter plot. The value of the SERE statistic is: + - 0 when samples are identical (no variability at all: this may happen in the case of a sample duplication); + - 1 for technical replicates (technical variability follows a Poisson distribution); + - greater than 1 for biological replicates and samples from different biological conditions (biological variability is higher than technical one, data are over-dispersed with respect to Poisson). The higher the SERE value, the lower the similarity. It is expected to be lower between biological replicates than between samples of different biological conditions. Hence, the SERE statistic can be used to detect inversions between samples. -
+
- Pairwise scatter plot (not produced when more than 30 samples) -
Figure 5: Pairwise comparison of samples.
-
-
+![Figure 5: Pairwise comparison of samples (not produced when more than 30 samples).](figures/pairwiseScatter.png) + --------------------------------------------------------------------------------------------------------------------------- -## 3 Filtering low counts +# Filtering low counts edgeR suggests to filter features with null or low counts because they do not supply much information. For this project, `r nrow(counts) - nrow(out.edgeR$dge$counts)` features (`r round(100*(nrow(counts)-nrow(out.edgeR$dge$counts))/nrow(counts),2)`%) have been removed from the analysis because they did not satisfy the following condition: having at least `r cpmCutoff` counts-per-million in at least `r min(table(target[,varInt]))` samples. --------------------------------------------------------------------------------------------------------------------------- - -## 4 Variability within the experiment: data exploration +# Variability within the experiment: data exploration The main variability within the experiment is expected to come from biological differences between the samples. This can be checked in two ways. The first one is to perform a hierarchical clustering of the whole sample set. This is performed after a transformation of the count data as moderated log-counts-per-million. Figure 6 shows the dendrogram obtained from CPM data. An euclidean distance is computed between samples, and the dendrogram is built upon the Ward criterion. We expect this dendrogram to group replicates and separate biological conditions. -
+
- Clustering -
Figure 6: Sample clustering based on normalized data.
+![Figure 6: Sample clustering based on normalized data.](figures/cluster.png){width=600} +
-
+ Another way of visualizing the experiment variability is to look at the first two dimensions of a multidimensional scaling plot, as shown on figure 7. On this figure, the first dimension is expected to separate samples from the different biological conditions, meaning that the biological variability is the main source of variance in the data. -
+
- Multidimensional scaling plot -
Figure 7: Multidimensional scaling plot of the samples.
+![Figure 7: Multidimensional scaling plot of the samples.](figures/MDS.png){width=600} +
-
-```{r , cache=TRUE, echo=FALSE, results="asis"} + +```{r , echo=FALSE, results="asis"} if (!is.null(batch)){ cat("For the statistical analysis, we need to take into account the effect of the ",batch," parameter. Statistical models and tests will thus be adjusted on it.\n") } ``` --------------------------------------------------------------------------------------------------------------------------- - -## 5 Normalization +# Normalization Normalization aims at correcting systematic technical biases in the data, in order to make read counts comparable across samples. The normalization proposed by edgeR is called Trimmed Mean of M-values (TMM) but it is also possible to use the RLE (DESeq) or upperquartile normalizations. It relies on the hypothesis that most features are not differentially expressed. edgeR computes a factor for each sample. These normalization factors apply to the total number of counts and cannot be used to normalize read counts in a direct manner. Indeed, normalization factors are used to normalize total counts. These in turn are used to normalize read counts according to a total count normalization: if $N_j$ is the total number of reads of the sample $j$ and $f_j$ its normalization factor, $N'_j=f_j \times N_j$ is the normalized total number of reads. Then, let $s_j=N'_j/\bar{N'}$ with $\bar{N'}$ the mean of the $N'_j$ s. Finally, the normalized counts of the sample $j$ are defined as $x'_{ij}=x_{ij}/s_j$ where $i$ is the gene index. -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} print(xtable(t(matrix(out.edgeR$dge$samples$norm.factors, dimnames=list(target$label,paste0(normalizationMethod," normalization factors")))),caption="Table 5: Normalization factors."), type="html", html.table.attributes = "align='center'") ``` Boxplots are often used to assess the quality of the normalization process, as they show how distributions are globally affected during this process. We expect normalization to stabilize distributions across samples. Figure 8 shows boxplots of raw (left) and normalized (right) data respectively. -
+
- Boxplots of raw and normalized counts -
Figure 8: Boxplots of raw (left) and normalized (right) read counts.
+![Figure 8: Boxplots of raw (left) and normalized (right) read counts.](figures/countsBoxplots.png){width=1200} +
-
--------------------------------------------------------------------------------------------------------------------------- -## 6 Differential analysis +# Differential analysis -### 6.1 Modelization +## Modelization edgeR aims at fitting one linear model per feature. For this project, the design used is `r paste(as.character(paste("~", ifelse(!is.null(batch), paste(batch,"+"), ""), varInt)),collapse=" ")` and the goal is to estimate the models' coefficients which can be interpreted as $\log_2(\texttt{FC})$. These coefficients will then be tested to get p-values and adjusted p-values. -### 6.2 Dispersions estimation +## Dispersions estimation The edgeR model assumes that the count data follow a negative binomial distribution which is a robust alternative to the Poisson law when data are over-dispersed (the variance is higher than the mean). The first step of the statistical procedure is to estimate the dispersion of the data. -
+
- Dispersions estimations -
Figure 9: Dispersion estimates.
+![Figure 9: Dispersion estimates.](figures/BCV.png){width=600} +
-
+ Figure 9 shows the result of the dispersion estimation step. The x- and y-axes represent the mean count value and the estimated dispersion respectively. Black dots represent empirical dispersion estimates for each feature (from the observed count values). The blue curve shows the relationship between the means of the counts and the dispersions modeled with splines. The red segment represents the common dispersion. -### 6.3 Statistical test for differential expression +## Statistical test for differential expression Once the dispersion estimation and the model fitting have been done, edgeR can perform the statistical testing. Figure 10 shows the distributions of raw p-values computed by the statistical test for the comparison(s) done. This distribution is expected to be a mixture of a uniform distribution on $[0,1]$ and a peak around 0 corresponding to the differentially expressed features. -
+
- Histogram(s) of raw p-values -
Figure 10: Distribution(s) of raw p-values.
+![Figure 10: Distribution(s) of raw p-values.](figures/rawpHist.png) +
-
-### 6.4 Final results -A p-value adjustment is performed to take into account multiple testing and control the false positive rate to a chosen level $\alpha$. For this analysis, a `r pAdjustMethod` p-value adjustment was performed [Benjamini, 1995 and 2001] and the level of controlled false positive rate was set to `r alpha`. +## Final results + +A p-value adjustment is performed to take into account multiple testing and control the false positive rate to a chosen level $\alpha$. For this analysis, a `r pAdjustMethod` p-value adjustment was performed [@BH1995; @BY2001] and the level of controlled false positive rate was set to `r alpha`. -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} print(xtable(summaryResults$nDiffTotal,caption="Table 6: Number of up-, down- and total number of differentially expressed features for each comparison."),type="html",include.rownames=FALSE, html.table.attributes = "align='center'") ``` Figure 11 represents the MA-plot of the data for the comparisons done, where differentially expressed features are highlighted in red. A MA-plot represents the log ratio of differential expression as a function of the mean intensity for each feature. Triangles correspond to features having a too low/high $\log_2(\text{FC})$ to be displayed on the plot. -
+
- MA-plot(s) -
Figure 11: MA-plot(s) of each comparison. Red dots represent significantly differentially expressed features.
+![Figure 11: MA-plot(s) of each comparison. Red dots represent significantly differentially expressed features.](figures/MAPlot.png) +
-
+ Figure 12 shows the volcano plots for the comparisons performed and differentially expressed features are still highlighted in red. A volcano plot represents the log of the adjusted P value as a function of the log ratio of differential expression. -
+
- Volcano plot(s) -
Figure 12: Volcano plot(s) of each comparison. Red dots represent significantly differentially expressed features.
+![Figure 12: Volcano plot(s) of each comparison. Red dots represent significantly differentially expressed features.](figures/volcanoPlot.png) +
-
+ Full results as well as lists of differentially expressed features are provided in the following text files which can be easily read in a spreadsheet. For each comparison: + - TestVsRef.complete.txt contains results for all the features; - TestVsRef.up.txt contains results for up-regulated features. Features are ordered from the most significant adjusted p-value to the less significant one; - TestVsRef.down.txt contains results for down-regulated features. Features are ordered from the most significant adjusted p-value to the less significant one. These files contain the following columns: + - Id: unique feature identifier; - sampleName: raw counts per sample; - norm.sampleName: rounded normalized counts per sample; @@ -251,13 +234,11 @@ These files contain the following columns: - tagwise.dispersion: dispersion parameter estimated from feature counts (i.e. black dots on figure 9); - trended.dispersion: dispersion parameter estimated with splines (i.e. blue curve on figure 9). --------------------------------------------------------------------------------------------------------------------------- - -## 7 R session information and parameters +# R session information and parameters The versions of the R software and Bioconductor packages used for this analysis are listed below. It is important to save them if one wants to re-perform the analysis in the same conditions. -```{r , cache=TRUE, echo=FALSE, results="asis"} +```{r , echo=FALSE, results="asis"} si <- as.character(toLatex(sessionInfo())) si <- si[-c(1,length(si))] si <- gsub("(\\\\verb)|(\\|)", "", si) @@ -285,18 +266,4 @@ Parameter values used for this analysis are: - normalizationMethod: `r normalizationMethod` - colors: `r colors` --------------------------------------------------------------------------------------------------------------------------- - -## 8 Bibliography - -- R Core Team, **R: A Language and Environment for Statistical Computing**, _R Foundation for Statistical Computing_, 2014 -- Gentleman, Carey, Bates et al, **Bioconductor: Open software development for computational biology and bioinformatics**, _Genome Biology_, 2004 -- Robinson and Smyth, **Moderated statistical tests for assessing differences in tag abundance**, _Bioinformatics_, 2007 -- Robinson and Smyth, **Small-sample estimation of negative binomial dispersion, with applications to SAGE data**, _Biostatistics_, 2008 -- Robinson, McCarthy and Smyth, **edgeR: a Bioconductor package for differential expression analysis of digital gene expression data**, _Bioinformatics_, 2010 -- Dillies, Rau, Aubert et al, **A comprehensive evaluation of normalization methods for Illumina RNA-seq data analysis**, _Briefings in Bioinformatics_, 2012 -- Schulze, Kanwar, Golzenleuchter et al, **SERE: Single-parameter quality control and sample comparison for RNA-Seq**, _BMC Genomics_, 2012 -- McCarthy, Chen and Smyth, **Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation**, _Nucleic Acids Research_, 2012 -- Wu, Wang and Wu, **A new shrinkage estimator for dispersion improves differential expression detection in RNA-seq data**, _Biostatistics_, 2013 -- Benjamini and Hochberg, **Controlling the False Discovery Rate : A Practical and Powerful Approach to Multiple Testing**, _Journal of the Royal Statistical Society_, 1995 -- Benjamini and Yekutieli, **The control of the false discovery rate in multiple testing under dependency**, _The Annals of Statistics_, 2001 +# Bibliography diff --git a/template_script_DESeq2.r b/template_script_DESeq2.r index afc2dfe..9f02c84 100755 --- a/template_script_DESeq2.r +++ b/template_script_DESeq2.r @@ -1,8 +1,8 @@ ################################################################################ ### R script to compare several conditions with the SARTools and DESeq2 packages ### Hugo Varet -### May 2nd, 2017 -### designed to be executed with SARTools 1.4.1 +### June 1st, 2017 +### designed to be executed with SARTools 1.5.0 ################################################################################ ################################################################################ diff --git a/template_script_DESeq2_CL.r b/template_script_DESeq2_CL.r index 2b29d45..60316d8 100755 --- a/template_script_DESeq2_CL.r +++ b/template_script_DESeq2_CL.r @@ -1,184 +1,184 @@ -################################################################################ -### R script to compare several conditions with the SARTools and DESeq2 packages -### Hugo Varet -### May 2nd, 2017 -### designed to be executed with SARTools 1.4.1 -### run "Rscript template_script_DESeq2_CL.r --help" to get some help -################################################################################ - -rm(list=ls()) # remove all the objects from the R session -library(optparse) # to run the script in command lines - -# options list with associated default value. -option_list <- list( -make_option(c("-P", "--projectName"), - default=basename(getwd()), - dest="projectName", - help="name of the project used for the report [default: name of the current directory]."), - -make_option(c("-A", "--author"), - default=Sys.info()[7], - dest="author", - help="name of the report author [default: %default]."), - -make_option(c("-t", "--targetFile"), - default="target.txt", - dest="targetFile", - help="path to the design/target file [default: %default]."), - -make_option(c("-r", "--rawDir"), - default="raw", - dest="rawDir", - help="path to the directory containing the HTSeq files [default: %default]."), - -make_option(c("-F", "--featuresToRemove"), - default="alignment_not_unique,ambiguous,no_feature,not_aligned,too_low_aQual", - dest="FTR", - help="names of the features to be removed, more than once can be specified [default: %default]"), - -make_option(c("-v", "--varInt"), - default="group", - dest="varInt", - help="factor of interest [default: %default]"), - -make_option(c("-c", "--condRef"), - default="WT", - dest="condRef", - help="reference biological condition [default: %default]"), - -make_option(c("-b", "--batch"), - default=NULL, - dest="batch", - help="blocking factor [default: %default] or \"batch\" for example"), - -make_option(c("-f", "--fitType"), - default="parametric", - dest="fitType", - help="mean-variance relationship: [default: %default] or local"), - -make_option(c("-o", "--cooksCutoff"), - default=TRUE, - dest="cooksCutoff", - help="perform the outliers detection (default is TRUE)"), - -make_option(c("-i", "--independentFiltering"), - default=TRUE, - dest="independentFiltering", - help="perform independent filtering (default is TRUE)"), - -make_option(c("-a", "--alpha"), - default=0.05, - dest="alpha", - help="threshold of statistical significance [default: %default]"), - -make_option(c("-p", "--pAdjustMethod"), - default="BH", - dest="pAdjustMethod", - help="p-value adjustment method: \"BH\" or \"BY\" [default: %default]"), - -make_option(c("-T", "--typeTrans"), - default="VST", - dest="typeTrans", - help="transformation for PCA/clustering: \"VST\" ou \"rlog\" [default: %default]"), - -make_option(c("-l", "--locfunc"), - default="median", - dest="locfunc", - help="median or shorth to estimate the size factors [default: %default]"), - -make_option(c("-C", "--colors"), - default="dodgerblue,firebrick1,MediumVioletRed,SpringGreen,chartreuse,cyan,darkorchid,darkorange", - dest="cols", - help="colors of each biological condition on the plots\n\t\t\"col1,col2,col3,col4\"\n\t\t[default: %default]") -) - -# now parse the command line to check which option is given and get associated values -parser <- OptionParser(usage="usage: %prog [options]", - option_list=option_list, - description="Compare two or more biological conditions in a RNA-Seq framework with DESeq2.", - epilogue="For comments, bug reports etc... please contact Hugo Varet ") -opt <- parse_args(parser, args=commandArgs(trailingOnly=TRUE), positional_arguments=0)$options - -# get options and arguments -workDir <- getwd() -projectName <- opt$projectName # name of the project -author <- opt$author # author of the statistical analysis/report -targetFile <- opt$targetFile # path to the design/target file -rawDir <- opt$rawDir # path to the directory containing raw counts files -featuresToRemove <- unlist(strsplit(opt$FTR, ",")) # names of the features to be removed (specific HTSeq-count information and rRNA for example) -varInt <- opt$varInt # factor of interest -condRef <- opt$condRef # reference biological condition -batch <- opt$batch # blocking factor: NULL (default) or "batch" for example -fitType <- opt$fitType # mean-variance relationship: "parametric" (default) or "local" -cooksCutoff <- opt$cooksCutoff # outliers detection threshold (NULL to let DESeq2 choosing it) -independentFiltering <- opt$independentFiltering # TRUE/FALSE to perform independent filtering (default is TRUE) -alpha <- as.numeric(opt$alpha) # threshold of statistical significance -pAdjustMethod <- opt$pAdjustMethod # p-value adjustment method: "BH" (default) or "BY" -typeTrans <- opt$typeTrans # transformation for PCA/clustering: "VST" ou "rlog" -locfunc <- opt$locfunc # "median" (default) or "shorth" to estimate the size factors -colors <- unlist(strsplit(opt$cols, ",")) # vector of colors of each biologicial condition on the plots - -# print(paste("workDir", workDir)) -# print(paste("projectName", projectName)) -# print(paste("author", author)) -# print(paste("targetFile", targetFile)) -# print(paste("rawDir", rawDir)) -# print(paste("varInt", varInt)) -# print(paste("condRef", condRef)) -# print(paste("batch", batch)) -# print(paste("fitType", fitType)) -# print(paste("cooksCutoff", cooksCutoff)) -# print(paste("independentFiltering", independentFiltering)) -# print(paste("alpha", alpha)) -# print(paste("pAdjustMethod", pAdjustMethod)) -# print(paste("typeTrans", typeTrans)) -# print(paste("locfunc", locfunc)) -# print(paste("featuresToRemove", featuresToRemove)) -# print(paste("colors", colors)) - -################################################################################ -### running script ### -################################################################################ -# setwd(workDir) -library(SARTools) - -# checking parameters -problem <- checkParameters.DESeq2(projectName=projectName,author=author,targetFile=targetFile, - rawDir=rawDir,featuresToRemove=featuresToRemove,varInt=varInt, - condRef=condRef,batch=batch,fitType=fitType,cooksCutoff=cooksCutoff, - independentFiltering=independentFiltering,alpha=alpha,pAdjustMethod=pAdjustMethod, - typeTrans=typeTrans,locfunc=locfunc,colors=colors) -if (problem) quit(save="yes") - -# loading target file -target <- loadTargetFile(targetFile=targetFile, varInt=varInt, condRef=condRef, batch=batch) - -# loading counts -counts <- loadCountData(target=target, rawDir=rawDir, featuresToRemove=featuresToRemove) - -# description plots -majSequences <- descriptionPlots(counts=counts, group=target[,varInt], col=colors) - -# analysis with DESeq2 -out.DESeq2 <- run.DESeq2(counts=counts, target=target, varInt=varInt, batch=batch, - locfunc=locfunc, fitType=fitType, pAdjustMethod=pAdjustMethod, - cooksCutoff=cooksCutoff, independentFiltering=independentFiltering, alpha=alpha) - -# PCA + clustering -exploreCounts(object=out.DESeq2$dds, group=target[,varInt], typeTrans=typeTrans, col=colors) - -# summary of the analysis (boxplots, dispersions, diag size factors, export table, nDiffTotal, histograms, MA plot) -summaryResults <- summarizeResults.DESeq2(out.DESeq2, group=target[,varInt], col=colors, - independentFiltering=independentFiltering, - cooksCutoff=cooksCutoff, alpha=alpha) - -# save image of the R session -save.image(file=paste0(projectName, ".RData")) - -# generating HTML report -writeReport.DESeq2(target=target, counts=counts, out.DESeq2=out.DESeq2, summaryResults=summaryResults, - majSequences=majSequences, workDir=workDir, projectName=projectName, author=author, - targetFile=targetFile, rawDir=rawDir, featuresToRemove=featuresToRemove, varInt=varInt, - condRef=condRef, batch=batch, fitType=fitType, cooksCutoff=cooksCutoff, - independentFiltering=independentFiltering, alpha=alpha, pAdjustMethod=pAdjustMethod, - typeTrans=typeTrans, locfunc=locfunc, colors=colors) +################################################################################ +### R script to compare several conditions with the SARTools and DESeq2 packages +### Hugo Varet +### June 1st, 2017 +### designed to be executed with SARTools 1.5.0 +### run "Rscript template_script_DESeq2_CL.r --help" to get some help +################################################################################ + +rm(list=ls()) # remove all the objects from the R session +library(optparse) # to run the script in command lines + +# options list with associated default value. +option_list <- list( +make_option(c("-P", "--projectName"), + default=basename(getwd()), + dest="projectName", + help="name of the project used for the report [default: name of the current directory]."), + +make_option(c("-A", "--author"), + default=Sys.info()[7], + dest="author", + help="name of the report author [default: %default]."), + +make_option(c("-t", "--targetFile"), + default="target.txt", + dest="targetFile", + help="path to the design/target file [default: %default]."), + +make_option(c("-r", "--rawDir"), + default="raw", + dest="rawDir", + help="path to the directory containing the HTSeq files [default: %default]."), + +make_option(c("-F", "--featuresToRemove"), + default="alignment_not_unique,ambiguous,no_feature,not_aligned,too_low_aQual", + dest="FTR", + help="names of the features to be removed, more than once can be specified [default: %default]"), + +make_option(c("-v", "--varInt"), + default="group", + dest="varInt", + help="factor of interest [default: %default]"), + +make_option(c("-c", "--condRef"), + default="WT", + dest="condRef", + help="reference biological condition [default: %default]"), + +make_option(c("-b", "--batch"), + default=NULL, + dest="batch", + help="blocking factor [default: %default] or \"batch\" for example"), + +make_option(c("-f", "--fitType"), + default="parametric", + dest="fitType", + help="mean-variance relationship: [default: %default] or local"), + +make_option(c("-o", "--cooksCutoff"), + default=TRUE, + dest="cooksCutoff", + help="perform the outliers detection (default is TRUE)"), + +make_option(c("-i", "--independentFiltering"), + default=TRUE, + dest="independentFiltering", + help="perform independent filtering (default is TRUE)"), + +make_option(c("-a", "--alpha"), + default=0.05, + dest="alpha", + help="threshold of statistical significance [default: %default]"), + +make_option(c("-p", "--pAdjustMethod"), + default="BH", + dest="pAdjustMethod", + help="p-value adjustment method: \"BH\" or \"BY\" [default: %default]"), + +make_option(c("-T", "--typeTrans"), + default="VST", + dest="typeTrans", + help="transformation for PCA/clustering: \"VST\" ou \"rlog\" [default: %default]"), + +make_option(c("-l", "--locfunc"), + default="median", + dest="locfunc", + help="median or shorth to estimate the size factors [default: %default]"), + +make_option(c("-C", "--colors"), + default="dodgerblue,firebrick1,MediumVioletRed,SpringGreen,chartreuse,cyan,darkorchid,darkorange", + dest="cols", + help="colors of each biological condition on the plots\n\t\t\"col1,col2,col3,col4\"\n\t\t[default: %default]") +) + +# now parse the command line to check which option is given and get associated values +parser <- OptionParser(usage="usage: %prog [options]", + option_list=option_list, + description="Compare two or more biological conditions in a RNA-Seq framework with DESeq2.", + epilogue="For comments, bug reports etc... please contact Hugo Varet ") +opt <- parse_args(parser, args=commandArgs(trailingOnly=TRUE), positional_arguments=0)$options + +# get options and arguments +workDir <- getwd() +projectName <- opt$projectName # name of the project +author <- opt$author # author of the statistical analysis/report +targetFile <- opt$targetFile # path to the design/target file +rawDir <- opt$rawDir # path to the directory containing raw counts files +featuresToRemove <- unlist(strsplit(opt$FTR, ",")) # names of the features to be removed (specific HTSeq-count information and rRNA for example) +varInt <- opt$varInt # factor of interest +condRef <- opt$condRef # reference biological condition +batch <- opt$batch # blocking factor: NULL (default) or "batch" for example +fitType <- opt$fitType # mean-variance relationship: "parametric" (default) or "local" +cooksCutoff <- opt$cooksCutoff # outliers detection threshold (NULL to let DESeq2 choosing it) +independentFiltering <- opt$independentFiltering # TRUE/FALSE to perform independent filtering (default is TRUE) +alpha <- as.numeric(opt$alpha) # threshold of statistical significance +pAdjustMethod <- opt$pAdjustMethod # p-value adjustment method: "BH" (default) or "BY" +typeTrans <- opt$typeTrans # transformation for PCA/clustering: "VST" ou "rlog" +locfunc <- opt$locfunc # "median" (default) or "shorth" to estimate the size factors +colors <- unlist(strsplit(opt$cols, ",")) # vector of colors of each biologicial condition on the plots + +# print(paste("workDir", workDir)) +# print(paste("projectName", projectName)) +# print(paste("author", author)) +# print(paste("targetFile", targetFile)) +# print(paste("rawDir", rawDir)) +# print(paste("varInt", varInt)) +# print(paste("condRef", condRef)) +# print(paste("batch", batch)) +# print(paste("fitType", fitType)) +# print(paste("cooksCutoff", cooksCutoff)) +# print(paste("independentFiltering", independentFiltering)) +# print(paste("alpha", alpha)) +# print(paste("pAdjustMethod", pAdjustMethod)) +# print(paste("typeTrans", typeTrans)) +# print(paste("locfunc", locfunc)) +# print(paste("featuresToRemove", featuresToRemove)) +# print(paste("colors", colors)) + +################################################################################ +### running script ### +################################################################################ +# setwd(workDir) +library(SARTools) + +# checking parameters +problem <- checkParameters.DESeq2(projectName=projectName,author=author,targetFile=targetFile, + rawDir=rawDir,featuresToRemove=featuresToRemove,varInt=varInt, + condRef=condRef,batch=batch,fitType=fitType,cooksCutoff=cooksCutoff, + independentFiltering=independentFiltering,alpha=alpha,pAdjustMethod=pAdjustMethod, + typeTrans=typeTrans,locfunc=locfunc,colors=colors) +if (problem) quit(save="yes") + +# loading target file +target <- loadTargetFile(targetFile=targetFile, varInt=varInt, condRef=condRef, batch=batch) + +# loading counts +counts <- loadCountData(target=target, rawDir=rawDir, featuresToRemove=featuresToRemove) + +# description plots +majSequences <- descriptionPlots(counts=counts, group=target[,varInt], col=colors) + +# analysis with DESeq2 +out.DESeq2 <- run.DESeq2(counts=counts, target=target, varInt=varInt, batch=batch, + locfunc=locfunc, fitType=fitType, pAdjustMethod=pAdjustMethod, + cooksCutoff=cooksCutoff, independentFiltering=independentFiltering, alpha=alpha) + +# PCA + clustering +exploreCounts(object=out.DESeq2$dds, group=target[,varInt], typeTrans=typeTrans, col=colors) + +# summary of the analysis (boxplots, dispersions, diag size factors, export table, nDiffTotal, histograms, MA plot) +summaryResults <- summarizeResults.DESeq2(out.DESeq2, group=target[,varInt], col=colors, + independentFiltering=independentFiltering, + cooksCutoff=cooksCutoff, alpha=alpha) + +# save image of the R session +save.image(file=paste0(projectName, ".RData")) + +# generating HTML report +writeReport.DESeq2(target=target, counts=counts, out.DESeq2=out.DESeq2, summaryResults=summaryResults, + majSequences=majSequences, workDir=workDir, projectName=projectName, author=author, + targetFile=targetFile, rawDir=rawDir, featuresToRemove=featuresToRemove, varInt=varInt, + condRef=condRef, batch=batch, fitType=fitType, cooksCutoff=cooksCutoff, + independentFiltering=independentFiltering, alpha=alpha, pAdjustMethod=pAdjustMethod, + typeTrans=typeTrans, locfunc=locfunc, colors=colors) diff --git a/template_script_edgeR.r b/template_script_edgeR.r index 490c10b..16503aa 100755 --- a/template_script_edgeR.r +++ b/template_script_edgeR.r @@ -1,8 +1,8 @@ ################################################################################ ### R script to compare several conditions with the SARTools and edgeR packages ### Hugo Varet -### May 2nd, 2017 -### designed to be executed with SARTools 1.4.1 +### June 1st, 2017 +### designed to be executed with SARTools 1.5.0 ################################################################################ ################################################################################ diff --git a/template_script_edgeR_CL.r b/template_script_edgeR_CL.r index 8af7f3a..f3b8b19 100755 --- a/template_script_edgeR_CL.r +++ b/template_script_edgeR_CL.r @@ -1,167 +1,167 @@ -################################################################################ -### R script to compare several conditions with the SARTools and edgeR packages -### Hugo Varet -### May 2nd, 2017 -### designed to be executed with SARTools 1.4.1 -### run "Rscript template_script_edgeR_CL.r --help" to get some help -################################################################################ - -rm(list=ls()) # remove all the objects from the R session -library(optparse) # to run the script in command lines - -# options list with associated default value. -option_list <- list( -make_option(c("-P", "--projectName"), - default=basename(getwd()), - dest="projectName", - help="name of the project used for the report [default: name of the current directory]."), - -make_option(c("-A", "--author"), - default=Sys.info()[7], - dest="author", - help="name of the report author [default: %default]."), - -make_option(c("-t", "--targetFile"), - default="target.txt", - dest="targetFile", - help="path to the design/target file [default: %default]."), - -make_option(c("-r", "--rawDir"), - default="raw", - dest="rawDir", - help="path to the directory containing the HTSeq files [default: %default]."), - -make_option(c("-F", "--featuresToRemove"), - default="alignment_not_unique,ambiguous,no_feature,not_aligned,too_low_aQual", - dest="FTR", - help="names of the features to be removed, more than once can be specified [default: %default]"), - -make_option(c("-v", "--varInt"), - default="group", - dest="varInt", - help="factor of interest [default: %default]"), - -make_option(c("-c", "--condRef"), - default="WT", - dest="condRef", - help="reference biological condition [default: %default]"), - -make_option(c("-b", "--batch"), - default=NULL, - dest="batch", - help="blocking factor [default: %default] or \"batch\" for example"), - -make_option(c("-a", "--alpha"), - default=0.05, - dest="alpha", - help="threshold of statistical significance [default: %default]"), - -make_option(c("-p", "--pAdjustMethod"), - default="BH", - dest="pAdjustMethod", - help="p-value adjustment method: \"BH\" or \"BY\" [default: %default]"), - -make_option(c("-m", "--cpmCutoff"), - default=1, - dest="cpmCutoff", - help="counts-per-million cut-off to filter low counts"), - -make_option(c("-g", "--gene.selection"), - default="pairwise", - dest="gene.selection", - help="selection of the features in MDSPlot [default: %default]"), - -make_option(c("-n", "--normalizationMethod"), - default="TMM", - dest="normalizationMethod", - help="normalization method in calcNormFactors: \"TMM\", \"RLE\" or \"upperquartile\" [default: %default]"), - -make_option(c("-C", "--colors"), - default="dodgerblue,firebrick1,MediumVioletRed,SpringGreen,chartreuse,cyan,darkorchid,darkorange", - dest="cols", - help="colors of each biological condition on the plots\n\t\t\"col1,col2,col3,col4\"\n\t\t[default: %default]") -) - -# now parse the command line to check which option is given and get associated values -parser <- OptionParser(usage="usage: %prog [options]", - option_list=option_list, - description="Compare two or more biological conditions in a RNA-Seq framework with edgeR.", - epilogue="For comments, bug reports etc... please contact Hugo Varet ") -opt <- parse_args(parser, args=commandArgs(trailingOnly=TRUE), positional_arguments=0)$options - -# get options and arguments -workDir <- getwd() -projectName <- opt$projectName # name of the project -author <- opt$author # author of the statistical analysis/report -targetFile <- opt$targetFile # path to the design/target file -rawDir <- opt$rawDir # path to the directory containing raw counts files -featuresToRemove <- unlist(strsplit(opt$FTR, ",")) # names of the features to be removed (specific HTSeq-count information and rRNA for example) -varInt <- opt$varInt # factor of interest -condRef <- opt$condRef # reference biological condition -batch <- opt$batch # blocking factor: NULL (default) or "batch" for example -alpha <- as.numeric(opt$alpha) # threshold of statistical significance -pAdjustMethod <- opt$pAdjustMethod # p-value adjustment method: "BH" (default) or "BY" -gene.selection <- opt$gene.selection # selection of the features in MDSPlot -normalizationMethod <- opt$normalizationMethod # normalization method in calcNormFactors -cpmCutoff <- opt$cpmCutoff # counts-per-million cut-off to filter low counts -colors <- unlist(strsplit(opt$cols, ",")) # vector of colors of each biologicial condition on the plots - -# print(paste("workDir", workDir)) -# print(paste("projectName", projectName)) -# print(paste("author", author)) -# print(paste("targetFile", targetFile)) -# print(paste("rawDir", rawDir)) -# print(paste("varInt", varInt)) -# print(paste("condRef", condRef)) -# print(paste("batch", batch)) -# print(paste("alpha", alpha)) -# print(paste("pAdjustMethod", pAdjustMethod)) -# print(paste("featuresToRemove", featuresToRemove)) -# print(paste("colors", colors)) -# print(paste("gene.selection", gene.selection)) -# print(paste("normalizationMethod", normalizationMethod)) -# print(paste("cpmCutoff", cpmCutoff)) - -################################################################################ -### running script ### -################################################################################ -# setwd(workDir) -library(SARTools) - -# checking parameters -problem <- checkParameters.edgeR(projectName=projectName,author=author,targetFile=targetFile, - rawDir=rawDir,featuresToRemove=featuresToRemove,varInt=varInt, - condRef=condRef,batch=batch,alpha=alpha,pAdjustMethod=pAdjustMethod, - cpmCutoff=cpmCutoff,gene.selection=gene.selection, - normalizationMethod=normalizationMethod,colors=colors) -if (problem) quit(save="yes") - -# loading target file -target <- loadTargetFile(targetFile=targetFile, varInt=varInt, condRef=condRef, batch=batch) - -# loading counts -counts <- loadCountData(target=target, rawDir=rawDir, featuresToRemove=featuresToRemove) - -# description plots -majSequences <- descriptionPlots(counts=counts, group=target[,varInt], col=colors) - -# edgeR analysis -out.edgeR <- run.edgeR(counts=counts, target=target, varInt=varInt, condRef=condRef, - batch=batch, cpmCutoff=cpmCutoff, normalizationMethod=normalizationMethod, - pAdjustMethod=pAdjustMethod) - -# MDS + clustering -exploreCounts(object=out.edgeR$dge, group=target[,varInt], gene.selection=gene.selection, col=colors) - -# summary of the analysis (boxplots, dispersions, export table, nDiffTotal, histograms, MA plot) -summaryResults <- summarizeResults.edgeR(out.edgeR, group=target[,varInt], counts=counts, alpha=alpha, col=colors) - -# save image of the R session -save.image(file=paste0(projectName, ".RData")) - -# generating HTML report -writeReport.edgeR(target=target, counts=counts, out.edgeR=out.edgeR, summaryResults=summaryResults, - majSequences=majSequences, workDir=workDir, projectName=projectName, author=author, - targetFile=targetFile, rawDir=rawDir, featuresToRemove=featuresToRemove, varInt=varInt, - condRef=condRef, batch=batch, alpha=alpha, pAdjustMethod=pAdjustMethod, colors=colors, - gene.selection=gene.selection, normalizationMethod=normalizationMethod) +################################################################################ +### R script to compare several conditions with the SARTools and edgeR packages +### Hugo Varet +### June 1st, 2017 +### designed to be executed with SARTools 1.5.0 +### run "Rscript template_script_edgeR_CL.r --help" to get some help +################################################################################ + +rm(list=ls()) # remove all the objects from the R session +library(optparse) # to run the script in command lines + +# options list with associated default value. +option_list <- list( +make_option(c("-P", "--projectName"), + default=basename(getwd()), + dest="projectName", + help="name of the project used for the report [default: name of the current directory]."), + +make_option(c("-A", "--author"), + default=Sys.info()[7], + dest="author", + help="name of the report author [default: %default]."), + +make_option(c("-t", "--targetFile"), + default="target.txt", + dest="targetFile", + help="path to the design/target file [default: %default]."), + +make_option(c("-r", "--rawDir"), + default="raw", + dest="rawDir", + help="path to the directory containing the HTSeq files [default: %default]."), + +make_option(c("-F", "--featuresToRemove"), + default="alignment_not_unique,ambiguous,no_feature,not_aligned,too_low_aQual", + dest="FTR", + help="names of the features to be removed, more than once can be specified [default: %default]"), + +make_option(c("-v", "--varInt"), + default="group", + dest="varInt", + help="factor of interest [default: %default]"), + +make_option(c("-c", "--condRef"), + default="WT", + dest="condRef", + help="reference biological condition [default: %default]"), + +make_option(c("-b", "--batch"), + default=NULL, + dest="batch", + help="blocking factor [default: %default] or \"batch\" for example"), + +make_option(c("-a", "--alpha"), + default=0.05, + dest="alpha", + help="threshold of statistical significance [default: %default]"), + +make_option(c("-p", "--pAdjustMethod"), + default="BH", + dest="pAdjustMethod", + help="p-value adjustment method: \"BH\" or \"BY\" [default: %default]"), + +make_option(c("-m", "--cpmCutoff"), + default=1, + dest="cpmCutoff", + help="counts-per-million cut-off to filter low counts"), + +make_option(c("-g", "--gene.selection"), + default="pairwise", + dest="gene.selection", + help="selection of the features in MDSPlot [default: %default]"), + +make_option(c("-n", "--normalizationMethod"), + default="TMM", + dest="normalizationMethod", + help="normalization method in calcNormFactors: \"TMM\", \"RLE\" or \"upperquartile\" [default: %default]"), + +make_option(c("-C", "--colors"), + default="dodgerblue,firebrick1,MediumVioletRed,SpringGreen,chartreuse,cyan,darkorchid,darkorange", + dest="cols", + help="colors of each biological condition on the plots\n\t\t\"col1,col2,col3,col4\"\n\t\t[default: %default]") +) + +# now parse the command line to check which option is given and get associated values +parser <- OptionParser(usage="usage: %prog [options]", + option_list=option_list, + description="Compare two or more biological conditions in a RNA-Seq framework with edgeR.", + epilogue="For comments, bug reports etc... please contact Hugo Varet ") +opt <- parse_args(parser, args=commandArgs(trailingOnly=TRUE), positional_arguments=0)$options + +# get options and arguments +workDir <- getwd() +projectName <- opt$projectName # name of the project +author <- opt$author # author of the statistical analysis/report +targetFile <- opt$targetFile # path to the design/target file +rawDir <- opt$rawDir # path to the directory containing raw counts files +featuresToRemove <- unlist(strsplit(opt$FTR, ",")) # names of the features to be removed (specific HTSeq-count information and rRNA for example) +varInt <- opt$varInt # factor of interest +condRef <- opt$condRef # reference biological condition +batch <- opt$batch # blocking factor: NULL (default) or "batch" for example +alpha <- as.numeric(opt$alpha) # threshold of statistical significance +pAdjustMethod <- opt$pAdjustMethod # p-value adjustment method: "BH" (default) or "BY" +gene.selection <- opt$gene.selection # selection of the features in MDSPlot +normalizationMethod <- opt$normalizationMethod # normalization method in calcNormFactors +cpmCutoff <- opt$cpmCutoff # counts-per-million cut-off to filter low counts +colors <- unlist(strsplit(opt$cols, ",")) # vector of colors of each biologicial condition on the plots + +# print(paste("workDir", workDir)) +# print(paste("projectName", projectName)) +# print(paste("author", author)) +# print(paste("targetFile", targetFile)) +# print(paste("rawDir", rawDir)) +# print(paste("varInt", varInt)) +# print(paste("condRef", condRef)) +# print(paste("batch", batch)) +# print(paste("alpha", alpha)) +# print(paste("pAdjustMethod", pAdjustMethod)) +# print(paste("featuresToRemove", featuresToRemove)) +# print(paste("colors", colors)) +# print(paste("gene.selection", gene.selection)) +# print(paste("normalizationMethod", normalizationMethod)) +# print(paste("cpmCutoff", cpmCutoff)) + +################################################################################ +### running script ### +################################################################################ +# setwd(workDir) +library(SARTools) + +# checking parameters +problem <- checkParameters.edgeR(projectName=projectName,author=author,targetFile=targetFile, + rawDir=rawDir,featuresToRemove=featuresToRemove,varInt=varInt, + condRef=condRef,batch=batch,alpha=alpha,pAdjustMethod=pAdjustMethod, + cpmCutoff=cpmCutoff,gene.selection=gene.selection, + normalizationMethod=normalizationMethod,colors=colors) +if (problem) quit(save="yes") + +# loading target file +target <- loadTargetFile(targetFile=targetFile, varInt=varInt, condRef=condRef, batch=batch) + +# loading counts +counts <- loadCountData(target=target, rawDir=rawDir, featuresToRemove=featuresToRemove) + +# description plots +majSequences <- descriptionPlots(counts=counts, group=target[,varInt], col=colors) + +# edgeR analysis +out.edgeR <- run.edgeR(counts=counts, target=target, varInt=varInt, condRef=condRef, + batch=batch, cpmCutoff=cpmCutoff, normalizationMethod=normalizationMethod, + pAdjustMethod=pAdjustMethod) + +# MDS + clustering +exploreCounts(object=out.edgeR$dge, group=target[,varInt], gene.selection=gene.selection, col=colors) + +# summary of the analysis (boxplots, dispersions, export table, nDiffTotal, histograms, MA plot) +summaryResults <- summarizeResults.edgeR(out.edgeR, group=target[,varInt], counts=counts, alpha=alpha, col=colors) + +# save image of the R session +save.image(file=paste0(projectName, ".RData")) + +# generating HTML report +writeReport.edgeR(target=target, counts=counts, out.edgeR=out.edgeR, summaryResults=summaryResults, + majSequences=majSequences, workDir=workDir, projectName=projectName, author=author, + targetFile=targetFile, rawDir=rawDir, featuresToRemove=featuresToRemove, varInt=varInt, + condRef=condRef, batch=batch, alpha=alpha, pAdjustMethod=pAdjustMethod, colors=colors, + gene.selection=gene.selection, normalizationMethod=normalizationMethod)