diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh index f4f4bcf74..dce808625 100755 --- a/.ci_stuff/test_dag.sh +++ b/.ci_stuff/test_dag.sh @@ -101,6 +101,30 @@ touch allelic_BAM_input/allelic_bams/sample1.genome1.sorted.bam \ allelic_BAM_input/allelic_bams/sample5.genome2.sorted.bam.bai \ allelic_BAM_input/allelic_bams/sample6.genome1.sorted.bam.bai \ allelic_BAM_input/allelic_bams/sample6.genome2.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample1.allele_flagged.sorted.bam \ + allelic_BAM_input/allelic_bams/sample1.unassigned.sorted.bam \ + allelic_BAM_input/allelic_bams/sample2.allele_flagged.sorted.bam \ + allelic_BAM_input/allelic_bams/sample2.unassigned.sorted.bam \ + allelic_BAM_input/allelic_bams/sample3.allele_flagged.sorted.bam \ + allelic_BAM_input/allelic_bams/sample3.unassigned.sorted.bam \ + allelic_BAM_input/allelic_bams/sample4.allele_flagged.sorted.bam \ + allelic_BAM_input/allelic_bams/sample4.unassigned.sorted.bam \ + allelic_BAM_input/allelic_bams/sample5.allele_flagged.sorted.bam \ + allelic_BAM_input/allelic_bams/sample5.unassigned.sorted.bam \ + allelic_BAM_input/allelic_bams/sample6.allele_flagged.sorted.bam \ + allelic_BAM_input/allelic_bams/sample6.unassigned.sorted.bam \ + allelic_BAM_input/allelic_bams/sample1.allele_flagged.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample1.unassigned.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample2.allele_flagged.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample2.unassigned.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample3.allele_flagged.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample3.unassigned.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample4.allele_flagged.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample4.unassigned.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample5.allele_flagged.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample5.unassigned.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample6.allele_flagged.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample6.unassigned.sorted.bam.bai \ allelic_BAM_input/deepTools_qc/bamPEFragmentSize/fragmentSize.metric.tsv \ allelic_BAM_input/filtered_bam/sample1.filtered.bam \ allelic_BAM_input/filtered_bam/sample2.filtered.bam \ @@ -197,18 +221,18 @@ if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 628 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --bigWigType log2ratio .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 562 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller SEACR .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 759 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 815 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller SEACR --useSpikeInForNorm .ci_stuff/spikein_organism.yaml .ci_stuff/ChIP.sample_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1203 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1259 ]; then exit 1 ; fi #noInput WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_noControl_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 403 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller Genrich .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_noControl_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 349 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller SEACR .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_noControl_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 469 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 525 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller SEACR --useSpikeInForNorm .ci_stuff/spikein_organism.yaml .ci_stuff/ChIP.sample_noControl_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 696 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 752 ]; then exit 1 ; fi # fromBAM WC=`ChIP-seq -d outdir --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --fromBAM BAM_input/filtered_bam/ .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1187 ]; then exit 1 ; fi @@ -314,6 +338,10 @@ WC=`mRNA-seq -m allelic-mapping,deepTools_qc -i PE_input -o output --sampleSheet if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2539 ]; then exit 1 ; fi WC=`mRNA-seq -m allelic-mapping,deepTools_qc,alignment-free -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 3294 ]; then exit 1 ; fi +WC=`mRNA-seq -m allelic-counting -i allelic_BAM_input/allelic_bams --fromBAM --bamExt '.sorted.bam' -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 659 ]; then exit 1 ; fi + + WC=`noncoding-RNA-seq -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1370 ]; then exit 1 ; fi diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 92d0696fb..c641cf909 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -127,4 +127,4 @@ jobs: run: | micromamba activate snakePipes_CI snakePipes config --tempDir /tmp - snakePipes createEnvs --autodetectCondaEnvDir --force --only ${{matrix.envs}} + snakePipes createEnvs --autodetectCondaEnvDir --only ${{matrix.envs}} diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 2decc8c06..de0b1c84e 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -52,5 +52,5 @@ jobs: - name: createEnvsOSX run: | micromamba activate snakePipes_CI - snakePipes createEnvs --force --autodetectCondaEnvDir --only ${{matrix.envs}} + snakePipes createEnvs --autodetectCondaEnvDir --only ${{matrix.envs}} diff --git a/bin/snakePipes b/bin/snakePipes index 754aa0a37..f74a022dd 100755 --- a/bin/snakePipes +++ b/bin/snakePipes @@ -54,11 +54,11 @@ def parse_arguments(): "possible environments are: {}".format(cof.set_env_yamls().keys()), ) - createEnvsParser.add_argument( - "--force", - action="store_true", - help="Force creation of conda environments, even if they apparently exist.", - ) +# createEnvsParser.add_argument( +# "--force", +# action="store_true", +# help="Force creation of conda environments, even if they apparently exist.", +# ) createEnvsParser.add_argument( "--info", @@ -334,7 +334,6 @@ def createCondaEnvs(args): "mamba", "env", "create", - "--force", "--file", os.path.join(baseDir, "shared/rules", env), ] @@ -342,7 +341,7 @@ def createCondaEnvs(args): # Don't actually create the env if either --info is set or it already exists and --force is NOT set if not args.info: - if not os.path.exists(os.path.join(condaDirUse, h)) or args.force: + if not os.path.exists(os.path.join(condaDirUse, h)): try: os.makedirs(os.path.join(condaDirUse, h), exist_ok=True) subprocess.check_call(cmd) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index d7538294e..f9aabba44 100755 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,6 +1,6 @@ package: name: snakepipes - version: 2.8.1 + version: 2.8.2 source: path: ../ diff --git a/docs/content/News.rst b/docs/content/News.rst index bdda58274..fcbefb31e 100644 --- a/docs/content/News.rst +++ b/docs/content/News.rst @@ -1,5 +1,22 @@ snakePipes News =============== + +snakePipes 2.8.2 +________________ + +* added SEACR peaks qc +* added external bed functionality for differential binding in ChIP-seq and ATAC-seq workflows +* added "allelic-counting" mode to mRNA-seq, allowing to count reads and run DGE from allelic bam files split e.g. by whatshap +* added support for custom model formula to mRNA-seq workflow +* fixed copyfile command for sampleSheet +* removed deprecated --force argument from mamba commands +* fixes #998 +* fixes #997 +* fixes #996 +* fixes #994 +* fixes #1000 +* fixes #1001 + snakePipes 2.8.1 ---------------- * Boosted versions on shared_env, as python 3.7 and multiqc no longer work together. diff --git a/docs/content/workflows/mRNA-seq.rst b/docs/content/workflows/mRNA-seq.rst index e26d52564..4393ea282 100644 --- a/docs/content/workflows/mRNA-seq.rst +++ b/docs/content/workflows/mRNA-seq.rst @@ -144,6 +144,9 @@ Like the other workflows, differential expression can be performed using the ``- .. note:: The first entry defines which group of samples are control. This way, the order of comparison and likewise the sign of values can be changed. The DE analysis might fail if your sample names begin with a number. So watch out for that! + +Optionally, the user may submit their desired model formula (without the leading tilda ``~``) with ``--formula``. + Differential Splicing --------------------- @@ -194,6 +197,11 @@ Allele-specific, gene-level differential expression analysis is then performed u .. note:: **allelic-mapping** mode is mutually exclusive with **mapping** mode +"allelic-counting" +~~~~~~~~~~~~~~~~~~ + +**allelic-counting** mode requires the user to input, per sample, 4 bam files, corresponding to haplotype1, haplotype2, unassigned and haplotagged , e.g. as generated by whatshap. The respective suffixes ".genome1", ".genome2", ".unassigned", ".alelle_flagged" are required to be followed by the bam extention ".sorted.bam". This mode is mutually exclusive with "deepTools_qc". Only the allelic version of deepTools qc will be run, by default. Allelic version of featureCounts will be run by default. If sample sheet is provided, allelic DESeq2- or allelic Salmon-based differential gene expression analysis will be run. + "alignment-free" ~~~~~~~~~~~~~~~~ diff --git a/snakePipes/__init__.py b/snakePipes/__init__.py index 80e22f7ae..964a32ab4 100755 --- a/snakePipes/__init__.py +++ b/snakePipes/__init__.py @@ -1 +1 @@ -__version__ = '2.8.1' +__version__ = '2.8.2' diff --git a/snakePipes/common_functions.py b/snakePipes/common_functions.py index 8a5457cd1..abe36b324 100644 --- a/snakePipes/common_functions.py +++ b/snakePipes/common_functions.py @@ -173,6 +173,20 @@ def get_sample_names_bam(infiles, bamExt): return sorted(list(set(s))) +def get_sample_names_suffix_bam(infiles, bamExt): + """ + Get sample names without file extensions + """ + bamSuff = [x + bamExt for x in [".genome1", ".genome2", ".unassigned", ".allele_flagged"]] + s = [] + for x in infiles: + for y in bamSuff: + if y in os.path.basename(x): + x = os.path.basename(x).replace(y, "") + s.append(x) + return sorted(list(set(s))) + + def is_paired(infiles, ext, reads): """ Check for paired-end input files @@ -848,7 +862,7 @@ def copySampleSheet(sampleSheet, wdir): if os.path.isfile(sampleSheet) and os.path.exists(wdir): bname = os.path.basename(sampleSheet) try: - shutil.copy(sampleSheet, os.path.join(wdir, bname)) + shutil.copyfile(sampleSheet, os.path.join(wdir, bname)) except Exception as err: print("Unexpected error:\n{}".format(err)) raise diff --git a/snakePipes/parserCommon.py b/snakePipes/parserCommon.py index 0042ce5da..cf0280d43 100644 --- a/snakePipes/parserCommon.py +++ b/snakePipes/parserCommon.py @@ -168,7 +168,8 @@ def snpArguments(defaults): snpargs.add_argument("--NMaskedIndex", default='', - help="N-masked index of the reference genome (default: 'None')") + help="N-masked index of the reference genome (default: 'None'). " + "Note that this should point to a file (i.e. 'Genome' for STAR indices, genome.1.bt2 for bowtie2 indices).") return parser diff --git a/snakePipes/shared/defaults.yaml b/snakePipes/shared/defaults.yaml index 1975203cb..f35badc8a 100755 --- a/snakePipes/shared/defaults.yaml +++ b/snakePipes/shared/defaults.yaml @@ -7,7 +7,7 @@ # permitted here. ################################################################################ # -condaEnvDir: '/package/mamba/envs/snakepipesenvs/2.8.1' +condaEnvDir: '/package/mamba/envs/snakepipesenvs/2.8.2' snakemakeOptions: '' organismsDir: 'shared/organisms' clusterConfig: 'shared/cluster.yaml' diff --git a/snakePipes/shared/rscripts/CSAW.R b/snakePipes/shared/rscripts/CSAW.R index 44efdb091..150941f78 100644 --- a/snakePipes/shared/rscripts/CSAW.R +++ b/snakePipes/shared/rscripts/CSAW.R @@ -15,6 +15,9 @@ outdir<-snakemake@params[["outdir"]] yaml_path<-snakemake@params[["yaml_path"]] useSpikeInForNorm<-snakemake@params[["useSpikeInForNorm"]] scale_factors<-snakemake@params[["scale_factors"]] +external_bed<-as.logical(snakemake@params[["externalBed"]]) + + bam_pfx<-ifelse(useSpikeInForNorm,"_host",".filtered") bam_folder<-ifelse(useSpikeInForNorm,"split_bam","filtered_bam") @@ -42,6 +45,7 @@ message(paste("FDR:", fdr, "\n")) message(paste("LFC:", lfc, "\n")) message(paste("paired-end? :", pairedEnd, "\n")) message(paste("allele-specific? :", allelic_info, "\n")) +message(paste("External bed? ;", external_bed, "\n")) ## sampleInfo (setup of the experiment) sampleInfo <- read.table(sampleInfoFilePath, header = TRUE, colClasses = c("character", "character")) @@ -98,35 +102,43 @@ if (!is.null(sampleInfo$UseRegions)) { fnames<-sampleInfo$name } -if(snakemake@params[['peakCaller']] == "MACS2") { - allpeaks <- lapply(fnames, function(x) { - narrow <- paste0("../MACS2/",x,bam_pfx,".BAM_peaks.narrowPeak") #bam_pfx - if(snakemake@params[["pipeline"]] %in% "ATAC-seq"){ - narrow <- paste0("../MACS2/",x,".filtered.short.BAM_peaks.narrowPeak") - } - broad <- paste0("../MACS2/",x,bam_pfx,".BAM_peaks.broadPeak") #bam_pfx - # first look for narrowpeak then braod peak - if(file.exists(narrow)) { - bed <- read.delim(narrow, header = FALSE) - } else if (file.exists(broad)) { - bed <- read.delim(broad, header = FALSE) - } else { - stop("MACS2 output doesn't exist. Neither ", narrow, " , nor ", broad) - } - - bed.gr <- GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4) - return(bed.gr) - }) -} else { - allpeaks = lapply(snakemake@input[['peaks']], function(x) { - bed = read.delim(paste0("../", x), header=FALSE) +if (! external_bed) { + if(snakemake@params[['peakCaller']] == "MACS2") { + allpeaks <- lapply(fnames, function(x) { + narrow <- paste0("../MACS2/",x,bam_pfx,".BAM_peaks.narrowPeak") #bam_pfx + if(snakemake@params[["pipeline"]] %in% "ATAC-seq"){ + narrow <- paste0("../MACS2/",x,".filtered.short.BAM_peaks.narrowPeak") + } + broad <- paste0("../MACS2/",x,bam_pfx,".BAM_peaks.broadPeak") #bam_pfx + #first look for narrowpeak then braod peak + if(file.exists(narrow)) { + bed <- read.delim(narrow, header = FALSE) + } else if (file.exists(broad)) { + bed <- read.delim(broad, header = FALSE) + } else { + stop("MACS2 output doesn't exist. Neither ", narrow, " , nor ", broad) + } + + bed.gr <- GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4) + return(bed.gr) + }) + } else { + allpeaks = lapply(snakemake@input[['peaks']], function(x) { + bed = read.delim(paste0("../", x), header=FALSE) + bed.gr = GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4) + return(bed.gr) + }) + # merge + allpeaks <- Reduce(function(x,y) GenomicRanges::union(x,y), allpeaks) + } + } else { + bed = read.delim(snakemake@input[['peaks']],header=FALSE) bed.gr = GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4) - return(bed.gr) - }) + allpeaks <- bed.gr } # merge -allpeaks <- Reduce(function(x,y) GenomicRanges::union(x,y), allpeaks) +#allpeaks <- Reduce(function(x,y) GenomicRanges::union(x,y), allpeaks) ## keep only these peaks for testing DB message(paste0("Filtering windows using MACS2 output : ", length(allpeaks) , " regions used (Union of peaks)")) @@ -165,4 +177,3 @@ sink("CSAW.session_info.txt") sessionInfo() sink() -print("DONE..!") diff --git a/snakePipes/shared/rscripts/DESeq2.R b/snakePipes/shared/rscripts/DESeq2.R index 4f76297b1..d762edd34 100755 --- a/snakePipes/shared/rscripts/DESeq2.R +++ b/snakePipes/shared/rscripts/DESeq2.R @@ -9,28 +9,47 @@ # args 5 : path to DE_functions # args 6 : T/F whether or not the workflow is allele-sepecific # args 7 : tx2gene file for salmon --> DESeq mode +# args 9: model formula .libPaths(R.home("library")) -args = commandArgs(TRUE) +#args = commandArgs(TRUE) ## re invented RNaseq workflow -sampleInfoFilePath <- args[1] -countFilePath <- args[2] -fdr <- as.numeric(args[3]) -geneNamesFilePath <- args[4] -importfunc <- args[5] -allelic_info <- as.logical(args[6]) +#sampleInfoFilePath <- args[1] +#countFilePath <- args[2] +#fdr <- as.numeric(args[3]) +#geneNamesFilePath <- args[4] +#importfunc <- args[5] +#allelic_info <- as.logical(args[6]) ## if output is from salmon then tx2gene file should be present -tx2gene_file <- args[7] +#tx2gene_file <- args[7] + +sampleInfoFilePath <- snakemake@params[["sampleSheet"]] +countFilePath <- snakemake@params[["counts_table"]] +fdr <- as.numeric(snakemake@params[["fdr"]]) +geneNamesFilePath <- snakemake@params[["symbol_file"]] +importfunc <- snakemake@params[["importfunc"]] +allelic_info <- as.logical(snakemake@params[["allele_info"]]) +tx2gene_file <- snakemake@params[["tx2gene_file"]] +rmdTemplate <- snakemake@params[["rmdTemplate"]] +formulaInput <- snakemake@params[["formula"]] +wdir <- snakemake@params[["outdir"]] + +setwd(wdir) + + if(file.exists(tx2gene_file)) { tximport <- TRUE } else { tximport <- FALSE } -rmdTemplate <- args[8] +#rmdTemplate <- args[8] + +#formulaInput <- args[9] + topN <- 50 ## include functions suppressPackageStartupMessages(library(ggplot2)) @@ -49,6 +68,7 @@ cat(paste("Working dir:", getwd(), "\n")) cat(paste("Sample info CSV:", sampleInfoFilePath, "\n")) cat(paste("Count file:", countFilePath, "\n")) cat(paste("FDR:", fdr, "\n")) +cat(paste("Custom formula:",formulaInput,"\n")) cat(paste("Gene names:", geneNamesFilePath, "\n")) cat(paste("Number of top N genes:", topN, "\n")) cat(paste("Salmon --> DESeq2 : ", tximport, "\n")) @@ -96,7 +116,7 @@ if(length(unique(sampleInfo$condition))>1){ if(tximport & allelic_info){ message("Detected allelic Salmon counts. Skipping DESeq_basic.") }else{ - seqout <- DESeq_basic(countdata, coldata = sampleInfo, fdr = fdr, alleleSpecific = allelic_info, from_salmon = tximport) + seqout <- DESeq_basic(countdata, coldata = sampleInfo, fdr = fdr, alleleSpecific = allelic_info, from_salmon = tximport, customFormula = formulaInput) DESeq_writeOutput(DEseqout = seqout, fdr = fdr, outprefix = "DEseq_basic", diff --git a/snakePipes/shared/rscripts/DE_functions.R b/snakePipes/shared/rscripts/DE_functions.R index de55cab74..0395465d1 100644 --- a/snakePipes/shared/rscripts/DE_functions.R +++ b/snakePipes/shared/rscripts/DE_functions.R @@ -92,9 +92,15 @@ checktable <- function(countdata = NA, sampleSheet = NA, alleleSpecific = FALSE, #' #' -DESeq_basic <- function(countdata, coldata, fdr, alleleSpecific = FALSE, from_salmon = FALSE, size_factors=NA) { +DESeq_basic <- function(countdata, coldata, fdr, alleleSpecific = FALSE, from_salmon = FALSE, size_factors=NA, customFormula=NA) { cnames.sub<-unique(colnames(coldata)[2:which(colnames(coldata) %in% "condition")]) - d<-as.formula(noquote(paste0("~",paste(cnames.sub,collapse="+")))) + + if(is.na(customFormula)){ + d<-as.formula(noquote(paste0("~",paste(cnames.sub,collapse="+")))) + } else { + + d<-as.formula(paste0("~",customFormula)) + } # Normal DESeq @@ -123,9 +129,9 @@ DESeq_basic <- function(countdata, coldata, fdr, alleleSpecific = FALSE, from_sa } dds <- DESeq2::DESeq(dds) ddr <- DESeq2::results(dds, alpha = fdr) - c1<-unique(coldata$condition)[1] - c2<-unique(coldata$condition)[2] - ddr_shrunk <- DESeq2::lfcShrink(dds,coef=paste0("condition_",c2,"_vs_",c1),type="apeglm",res=ddr) + a <- DESeq2::resultsNames(dds) + auto_coef <- a[length(a)] + ddr_shrunk <- DESeq2::lfcShrink(dds,coef=auto_coef,type="apeglm",res=ddr) output <- list(dds = dds, ddr = ddr, ddr_shrunk = ddr_shrunk) return(output) diff --git a/snakePipes/shared/rscripts/nearestGene.R b/snakePipes/shared/rscripts/nearestGene.R index ee98c0d50..12fdac694 100644 --- a/snakePipes/shared/rscripts/nearestGene.R +++ b/snakePipes/shared/rscripts/nearestGene.R @@ -30,9 +30,9 @@ if(any(is.na(size_v),sum(size_v==0)>0)){message('Some of the input files are non gs_tab<-data.table::fread(gene_symbol,header=FALSE) if(pipeline %in% c("chip-seq","ATAC-seq")){ - ibed_tab$GeneID<-t2g_tab$V2[match(ibed_tab$V19,t2g_tab$V1)] + ibed_tab$GeneID<-t2g_tab$V2[match(ibed_tab$V22,t2g_tab$V1)] ibed_tab$GeneSymbol<-gs_tab$V2[match(ibed_tab$GeneID,gs_tab$V1)] - obed_tab<-unique(subset(ibed_tab,select=c(paste0("V",c(1:18,20,21)),"GeneID","GeneSymbol"))) + obed_tab<-unique(subset(ibed_tab,select=c(paste0("V",c(1:18,23,24)),"GeneID","GeneSymbol"))) colnames(obed_tab)<-c("Chromosome","Start","End","Width","Strand","Score","nWindows","logFC.up","logFC.down","PValue","FDR","direction","rep.test","rep.logFC","best.logFC","best.test","best.start","Name","GeneStrand","Distance","GeneID","GeneSymbol") } diff --git a/snakePipes/shared/rules/CSAW.multiComp.snakefile b/snakePipes/shared/rules/CSAW.multiComp.snakefile index a451b1b73..fe9f6d124 100644 --- a/snakePipes/shared/rules/CSAW.multiComp.snakefile +++ b/snakePipes/shared/rules/CSAW.multiComp.snakefile @@ -22,8 +22,10 @@ def getInputPeaks(peakCaller, chip_samples, genrichDict,comp_group): return expand("SEACR/{chip_sample}_host.stringent.bed",chip_sample=chip_samples) elif pipeline == "chip-seq" and not useSpikeInForNorm: return expand("SEACR/{chip_sample}.filtered.stringent.bed",chip_sample=chip_samples) - else: + elif peakCaller == "Genrich": return expand("Genrich/{genrichGroup}.{{compGroup}}.narrowPeak", genrichGroup = genrichDict[comp_group].keys()) + elif externalBed: + return externalBed def getSizeMetrics(): @@ -103,7 +105,8 @@ rule CSAW: insert_size_metrics = lambda wildcards,input: os.path.join(outdir, input.insert_size_metrics) if pairedEnd else [], pipeline = pipeline, useSpikeInForNorm = useSpikeInForNorm, - scale_factors = lambda wildcards, input: os.path.join(outdir, input.scale_factors) if input.scale_factors else "" + scale_factors = lambda wildcards, input: os.path.join(outdir, input.scale_factors) if input.scale_factors else "", + externalBed = True if externalBed else False log: out = os.path.join(outdir, "{}/logs/CSAW.out".format(get_outdir(peakCaller,os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))), err = os.path.join(outdir, "{}/logs/CSAW.err".format(get_outdir(peakCaller,os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))) diff --git a/snakePipes/shared/rules/CSAW.singleComp.snakefile b/snakePipes/shared/rules/CSAW.singleComp.snakefile index d4d24693e..013732eee 100644 --- a/snakePipes/shared/rules/CSAW.singleComp.snakefile +++ b/snakePipes/shared/rules/CSAW.singleComp.snakefile @@ -16,8 +16,10 @@ def getInputPeaks(peakCaller, chip_samples, genrichDict): return expand("SEACR/{chip_sample}_host.stringent.bed",chip_sample=chip_samples) elif pipeline == "chip-seq" and not useSpikeInForNorm: return expand("SEACR/{chip_sample}.filtered.stringent.bed",chip_sample=chip_samples) - else: + elif peakCaller == "Genrich": return expand("Genrich/{genrichGroup}.narrowPeak", genrichGroup = genrichDict.keys()) + elif externalBed: + return externalBed def getSizeMetrics(): @@ -87,7 +89,8 @@ rule CSAW: insert_size_metrics = lambda wildcards,input: os.path.join(outdir, input.insert_size_metrics) if pairedEnd else [], pipeline = pipeline, useSpikeInForNorm = useSpikeInForNorm, - scale_factors = lambda wildcards, input: os.path.join(outdir, input.scale_factors) if input.scale_factors else "" + scale_factors = lambda wildcards, input: os.path.join(outdir, input.scale_factors) if input.scale_factors else "", + externalBed = True if externalBed else False log: out = os.path.join(outdir, "CSAW_{}_{}/logs/CSAW.out".format(peakCaller, sample_name)), err = os.path.join(outdir, "CSAW_{}_{}/logs/CSAW.err".format(peakCaller, sample_name)) diff --git a/snakePipes/shared/rules/ChIP_peak_calling.snakefile b/snakePipes/shared/rules/ChIP_peak_calling.snakefile index 687a4c708..fca12caba 100755 --- a/snakePipes/shared/rules/ChIP_peak_calling.snakefile +++ b/snakePipes/shared/rules/ChIP_peak_calling.snakefile @@ -170,10 +170,11 @@ if not isMultipleComparison: bams = lambda wildcards: ",".join(expand(os.path.join("filtered_bam", "{sample}.namesorted.bam"), sample=genrichDict[wildcards.group])), blacklist = "-E {}".format(blacklist_bed) if blacklist_bed else "", control_pfx=lambda wildcards,input: "-c" if input.control else "", - control=lambda wildcards,input: ",".join(input.control) if input.control else "" + control=lambda wildcards,input: ",".join(input.control) if input.control else "", + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -y 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} {params.ignoreForNorm} -y 2> {log} """ else: rule Genrich_peaks: @@ -188,10 +189,11 @@ if not isMultipleComparison: blacklist = "-E {}".format(blacklist_bed) if blacklist_bed else "", control_pfx=lambda wildcards,input: "-c" if input.control else "", control=lambda wildcards,input: ",".join(input.control) if input.control else "", - frag_size=fragmentLength + frag_size=fragmentLength, + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -w {params.frag_size} 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} {params.ignoreForNorm} -w {params.frag_size} 2> {log} """ else: if pairedEnd: @@ -206,10 +208,11 @@ else: bams = lambda wildcards: ",".join(expand(os.path.join("filtered_bam", "{sample}.namesorted.bam"), sample=genrichDict[wildcards.compGroup][wildcards.group])), blacklist = "-E {}".format(blacklist_bed) if blacklist_bed else "", control_pfx=lambda wildcards,input: "-c" if input.control else "", - control=lambda wildcards,input: ",".join(input.control) if input.control else "" + control=lambda wildcards,input: ",".join(input.control) if input.control else "", + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -y 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} {params.ignoreForNorm} -y 2> {log} """ else: rule Genrich_peaks: @@ -224,10 +227,11 @@ else: blacklist = "-E {}".format(blacklist_bed) if blacklist_bed else "", control_pfx=lambda wildcards,input: "-c" if input.control else "", control=lambda wildcards,input: ",".join(input.control) if input.control else "", - frag_size=fragmentLength + frag_size=fragmentLength, + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -w {params.frag_size} 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} {params.ignoreForNorm} -w {params.frag_size} 2> {log} """ @@ -253,10 +257,43 @@ rule SEACR_peaks: "SEACR/{chip_sample}.filtered.stringent.bed" log: "SEACR/logs/{chip_sample}.log" params: - fdr = fdr, + fdr = lambda wildcards,input: fdr if not input.control else "", prefix = os.path.join(outdir,"SEACR/{chip_sample}.filtered"), script=os.path.join(maindir, "shared","tools/SEACR-1.3/SEACR_1.3.sh") conda: CONDA_SEACR_ENV shell: """ bash {params.script} {input.chip} {input.control} {params.fdr} "norm" "stringent" {params.prefix} 2>{log} """ + +rule SEACR_peak_qc: + input: + bam = "filtered_bam/{sample}.filtered.bam", + peaks = "SEACR/{sample}.filtered.stringent.bed" + output: + qc = "SEACR/{sample}.filtered.stringent_peaks.qc.txt" + params: + genome_index = genome_index + benchmark: + "SEACR/.benchmark/SEACR_peak_qc.{sample}.filtered.benchmark" + conda: CONDA_SHARED_ENV + shell: """ + # get the number of peaks + peak_count=`wc -l < {input.peaks}` + + # get the number of mapped reads + mapped_reads=`samtools view -c -F 4 {input.bam}` + + #calculate the number of alignments overlapping the peaks + # exclude reads flagged as unmapped (unmapped reads will be reported when using -L) + reads_in_peaks=`samtools view -c -F 4 -L {input.peaks} {input.bam}` + + # calculate Fraction of Reads In Peaks + frip=`bc -l <<< "$reads_in_peaks/$mapped_reads"` + # compute peak genome coverage + peak_len=`awk '{{total+=$3-$2}}END{{print total}}' {input.peaks}` + genome_size=`awk '{{total+=$3-$2}}END{{print total}}' {params.genome_index}` + genomecov=`bc -l <<< "$peak_len/$genome_size"` + + # write peak-based QC metrics to output file + printf "peak_count\tFRiP\tpeak_genome_coverage\n%d\t%5.3f\t%6.4f\n" $peak_count $frip $genomecov > {output.qc} + """ diff --git a/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile b/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile index 0120984ce..51b083d43 100755 --- a/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile +++ b/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile @@ -175,10 +175,10 @@ if not isMultipleComparison: blacklist = "-E {}".format(blacklist_bed) if blacklist_bed else "", control_pfx=lambda wildcards,input: "-c" if input.control else "", control=lambda wildcards,input: ",".join(input.control) if input.control else "", - spikein_chroms=",".join(spikein_chr) + ignoreForNorm = '-e ' + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -e {params.spikein_chroms} -y 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} {params.ignoreForNorm} -y 2> {log} """ else: rule Genrich_peaks: @@ -194,10 +194,10 @@ if not isMultipleComparison: control_pfx=lambda wildcards,input: "-c" if input.control else "", control=lambda wildcards,input: ",".join(input.control) if input.control else "", frag_size=fragmentLength, - spikein_chroms=",".join(spikein_chr) + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -e {params.spikein_chroms} -w {params.frag_size} 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -e {params.ignoreForNorm} -w {params.frag_size} 2> {log} """ else: if pairedEnd: @@ -213,10 +213,10 @@ else: blacklist = "-E {}".format(blacklist_bed) if blacklist_bed else "", control_pfx=lambda wildcards,input: "-c" if input.control else "", control=lambda wildcards,input: ",".join(input.control) if input.control else "", - spikein_chroms=",".join(spikein_chr) + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -e {params.spikein_chroms} -y 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} {params.ignoreForNorm} -y 2> {log} """ else: rule Genrich_peaks: @@ -232,10 +232,10 @@ else: control_pfx=lambda wildcards,input: "-c" if input.control else "", control=lambda wildcards,input: ",".join(input.control) if input.control else "", frag_size=fragmentLength, - spikein_chroms=",".join(spikein_chr) + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -e {params.spikein_chroms} -w {params.frag_size} 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -e {params.ignoreForNorm} -w {params.frag_size} 2> {log} """ @@ -257,10 +257,46 @@ rule SEACR_peaks: "SEACR/{chip_sample}_host.stringent.bed" log: "SEACR/logs/{chip_sample}.log" params: - fdr = fdr, + fdr = lambda wildcards,input: fdr if not input.control else "", prefix = os.path.join(outdir,"SEACR/{chip_sample}_host"), script=os.path.join(maindir, "shared","tools/SEACR-1.3/SEACR_1.3.sh") conda: CONDA_SEACR_ENV shell: """ bash {params.script} {input.chip} {input.control} {params.fdr} "non" "stringent" {params.prefix} 2>{log} """ + + +rule SEACR_peak_qc: + input: + bam = "split_bam/{sample}_host.bam", + peaks = "SEACR/{sample}_host.stringent.bed" + output: + qc = "SEACR/{sample}_host.stringent_peaks.qc.txt" + params: + genome_index = genome_index + benchmark: + "SEACR/.benchmark/SEACR_peak_qc.{sample}_host.benchmark" + conda: CONDA_SHARED_ENV + shell: """ + # get the number of peaks + peak_count=`wc -l < {input.peaks}` + + #get the number of mapped reads + mapped_reads=`samtools view -c -F 4 {input.bam}` + + #calculate the number of alignments overlapping the peaks + # exclude reads flagged as unmapped (unmapped reads will be reported when using -L) + reads_in_peaks=`samtools view -c -F 4 -L {input.peaks} {input.bam}` + + # calculate Fraction of Reads In Peaks + frip=`bc -l <<< "$reads_in_peaks/$mapped_reads"` + # compute peak genome coverage + peak_len=`awk '{{total+=$3-$2}}END{{print total}}' {input.peaks}` + genome_size=`awk '{{total+=$3-$2}}END{{print total}}' {params.genome_index}` + genomecov=`bc -l <<< "$peak_len/$genome_size"` + + # write peak-based QC metrics to output file + printf "peak_count\tFRiP\tpeak_genome_coverage\n%d\t%5.3f\t%6.4f\n" $peak_count $frip $genomecov > {output.qc} + """ + + diff --git a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile index f60a60182..afb418e9e 100644 --- a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile @@ -34,24 +34,15 @@ rule DESeq2: importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE', tx2gene_file = 'NA', - rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd") + rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), + formula = config["formula"], + counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table), + symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file) log: out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")), err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")) conda: CONDA_RNASEQ_ENV - shell: - "cd {params.outdir} && " - "Rscript {params.script} " - "{params.sampleSheet} " # 1 - "../{input.counts_table} " # 2 - "{params.fdr} " # 3 - "../{input.symbol_file} " # 4 - "{params.importfunc} " # 5 - "{params.allele_info} " # 6 - "{params.tx2gene_file} " # 7 - "{params.rmdTemplate} " # 8 - " > ../{log.out} 2> ../{log.err}" - + script: "{params.script}" ## DESeq2 (on Salmon) rule DESeq2_Salmon_basic: @@ -74,18 +65,10 @@ rule DESeq2_Salmon_basic: fdr = fdr, importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), allele_info = 'FALSE', - tx2gene_file = "Annotation/genes.filtered.t2g", - rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd") + tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"), + rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), + formula = config["formula"], + counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table), + symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file) conda: CONDA_RNASEQ_ENV - shell: - "cd {params.outdir} && " - "Rscript {params.script} " - "{params.sampleSheet} " # 1 - "../{input.counts_table} " # 2 - "{params.fdr} " # 3 - "../{input.symbol_file} " # 4 - "{params.importfunc} " # 5 - "{params.allele_info} " # 6 - "../{input.tx2gene_file} " # 7 - "{params.rmdTemplate} " # 8 - " > ../{log.out} 2> ../{log.err}" + script: "{params.script}" diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile index c6822e41b..2e506fd5b 100644 --- a/snakePipes/shared/rules/DESeq2.singleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.singleComp.snakefile @@ -7,7 +7,7 @@ def get_outdir(folder_name,sampleSheet): ## DESeq2 (on featureCounts) rule DESeq2: input: - counts_table = lambda wildcards : "featureCounts/counts_allelic.tsv" if 'allelic-mapping' in mode else "featureCounts/counts.tsv", + counts_table = lambda wildcards : "featureCounts/counts_allelic.tsv" if 'allelic-mapping' in mode or "allelic-counting" in mode else "featureCounts/counts.tsv", sampleSheet = sampleSheet, symbol_file = "Annotation/genes.filtered.symbol" #get_symbol_file output: @@ -15,30 +15,22 @@ rule DESeq2: benchmark: "{}/.benchmark/DESeq2.featureCounts.benchmark".format(get_outdir("DESeq2",sampleSheet)) params: - script=os.path.join(maindir, "shared", "rscripts", "DESeq2.R"), + script = os.path.join(maindir, "shared", "rscripts", "DESeq2.R"), + sampleSheet = lambda wildcards,input: input.sampleSheet, outdir = get_outdir("DESeq2",sampleSheet), fdr = fdr, importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), - allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE', + allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode or "allelic-counting" in mode else 'FALSE', tx2gene_file = 'NA', - rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd") + rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), + formula = config["formula"], + counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table), + symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file) log: out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",sampleSheet)), err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",sampleSheet)) conda: CONDA_RNASEQ_ENV - shell: - "cd {params.outdir} && " - "Rscript {params.script} " - "{input.sampleSheet} " # 1 - "../{input.counts_table} " # 2 - "{params.fdr} " # 3 - "../{input.symbol_file} " # 4 - "{params.importfunc} " # 5 - "{params.allele_info} " # 6 - "{params.tx2gene_file} " # 7 - "{params.rmdTemplate} " # 8 - " > ../{log.out} 2> ../{log.err}" - + script: "{params.script}" ## DESeq2 (on Salmon) rule DESeq2_Salmon_basic: @@ -60,21 +52,14 @@ rule DESeq2_Salmon_basic: fdr = fdr, importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), allele_info = 'FALSE', - tx2gene_file = "Annotation/genes.filtered.t2g", - rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd") + tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"), + rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), + formula = config["formula"], + counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table), + symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file) conda: CONDA_RNASEQ_ENV - shell: - "cd {params.outdir} && " - "Rscript {params.script} " - "{input.sampleSheet} " # 1 - "../{input.counts_table} " # 2 - "{params.fdr} " # 3 - "../{input.symbol_file} " # 4 - "{params.importfunc} " # 5 - "{params.allele_info} " # 6 - "../{input.tx2gene_file} " # 7 - "{params.rmdTemplate} " # 8 - " > ../{log.out} 2> ../{log.err}" + script: "{params.script}" + rule DESeq2_Salmon_allelic: input: @@ -95,18 +80,10 @@ rule DESeq2_Salmon_allelic: fdr = fdr, importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), allele_info = 'TRUE', - tx2gene_file = "Annotation/genes.filtered.t2g", - rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd") + tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"), + rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), + formula = config["formula"], + counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table), + symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file) conda: CONDA_RNASEQ_ENV - shell: - "cd {params.outdir} && " - "Rscript {params.script} " - "{input.sampleSheet} " # 1 - "../{input.counts_table} " # 2 - "{params.fdr} " # 3 - "../{input.symbol_file} " # 4 - "{params.importfunc} " # 5 - "{params.allele_info} " # 6 - "../{input.tx2gene_file} " # 7 - "{params.rmdTemplate} " # 8 - " > ../{log.out} 2> ../{log.err}" + script: "{params.script}" diff --git a/snakePipes/shared/rules/LinkBam.snakefile b/snakePipes/shared/rules/LinkBam.snakefile index 0514049b6..ea498b0f4 100644 --- a/snakePipes/shared/rules/LinkBam.snakefile +++ b/snakePipes/shared/rules/LinkBam.snakefile @@ -1,49 +1,74 @@ import os -rule link_bam: - input: - indir + "/{sample}" + bamExt - output: - aligner + "/{sample}.unsorted.bam" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam" - params: - input_bai = indir + "/{sample}" + bamExt + ".bai", - output_bai = aligner + "/{sample}.unsorted.bam.bai" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam.bai" - run: - if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)): - os.symlink(params.input_bai,os.path.join(outdir,params.output_bai)) - if not os.path.exists(os.path.join(outdir,output[0])): - os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0])) - -if not pipeline=="noncoding-rna-seq": +if pipeline=="rna-seq" and "allelic-counting" in mode: + rule link_bam: + input: + indir + "/{sample}.{suffix}" + bamExt + output: + "allelic_bams/{sample}.{suffix}" + bamExt + params: + input_bai = indir + "/{sample}.{suffix}" + bamExt + ".bai", + output_bai = "allelic_bams/{sample}.{suffix}" + bamExt + ".bai" + run: + if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)): + os.symlink(params.input_bai,os.path.join(outdir,params.output_bai)) + if not os.path.exists(os.path.join(outdir,output[0])): + os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0])) + rule samtools_index_external: input: - aligner + "/{sample}.bam" + "allelic_bams/{sample}.{suffix}" + bamExt output: - aligner + "/{sample}.bam.bai" + "allelic_bams/{sample}.{suffix}" + bamExt + ".bai" conda: CONDA_SHARED_ENV shell: "if [[ ! -f {output[0]} ]]; then samtools index {input[0]}; fi" - if not pipeline=="WGBS" or pipeline=="WGBS" and skipBamQC: - rule link_bam_bai_external: + +else: + rule link_bam: + input: + indir + "/{sample}" + bamExt + output: + aligner + "/{sample}.unsorted.bam" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam" + params: + input_bai = indir + "/{sample}" + bamExt + ".bai", + output_bai = aligner + "/{sample}.unsorted.bam.bai" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam.bai" + run: + if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)): + os.symlink(params.input_bai,os.path.join(outdir,params.output_bai)) + if not os.path.exists(os.path.join(outdir,output[0])): + os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0])) + + if not pipeline=="noncoding-rna-seq": + rule samtools_index_external: input: - bam = aligner + "/{sample}.bam", - bai = aligner + "/{sample}.bam.bai" + aligner + "/{sample}.bam" output: - bam_out = "filtered_bam/{sample}.filtered.bam", - bai_out = "filtered_bam/{sample}.filtered.bam.bai", - shell: """ - ln -s ../{input.bam} {output.bam_out}; - ln -s ../{input.bai} {output.bai_out} - """ - - - rule sambamba_flagstat: - input: - aligner + "/{sample}.bam" - output: - "Sambamba/{sample}.markdup.txt" - log: "Sambamba/logs/{sample}.flagstat.log" - conda: CONDA_SAMBAMBA_ENV - shell: """ - sambamba flagstat -p {input} > {output} 2> {log} - """ + aligner + "/{sample}.bam.bai" + conda: CONDA_SHARED_ENV + shell: "if [[ ! -f {output[0]} ]]; then samtools index {input[0]}; fi" + + if not pipeline=="WGBS" or pipeline=="WGBS" and skipBamQC: + rule link_bam_bai_external: + input: + bam = aligner + "/{sample}.bam", + bai = aligner + "/{sample}.bam.bai" + output: + bam_out = "filtered_bam/{sample}.filtered.bam", + bai_out = "filtered_bam/{sample}.filtered.bam.bai", + shell: """ + ln -s ../{input.bam} {output.bam_out}; + ln -s ../{input.bai} {output.bai_out} + """ + + + rule sambamba_flagstat: + input: + aligner + "/{sample}.bam" + output: + "Sambamba/{sample}.markdup.txt" + log: "Sambamba/logs/{sample}.flagstat.log" + conda: CONDA_SAMBAMBA_ENV + shell: """ + sambamba flagstat -p {input} > {output} 2> {log} + """ diff --git a/snakePipes/shared/rules/SNPsplit.snakefile b/snakePipes/shared/rules/SNPsplit.snakefile index 37169ab55..ea06f0f15 100644 --- a/snakePipes/shared/rules/SNPsplit.snakefile +++ b/snakePipes/shared/rules/SNPsplit.snakefile @@ -8,8 +8,8 @@ if aligner == "Bowtie2": output: targetbam = expand("allelic_bams/{{sample}}.filtered.{suffix}.bam", suffix = ['allele_flagged', 'genome1', 'genome2', 'unassigned']), tempbam = temp("filtered_bam/{sample}.filtered.sortedByName.bam"), - rep1 = "allelic_bams/{sample}.SNPsplit_report.yaml", - rep2 = "allelic_bams/{sample}.SNPsplit_sort.yaml" + rep1 = "allelic_bams/{sample}.filtered.SNPsplit_report.yaml", + rep2 = "allelic_bams/{sample}.filtered.SNPsplit_sort.yaml" log: "allelic_bams/logs/{sample}.snp_split.log" params: pairedEnd = '--paired' if pairedEnd else '', diff --git a/snakePipes/shared/rules/envs/chip_seacr.yaml b/snakePipes/shared/rules/envs/chip_seacr.yaml index aaa69050f..35bcc1e8d 100644 --- a/snakePipes/shared/rules/envs/chip_seacr.yaml +++ b/snakePipes/shared/rules/envs/chip_seacr.yaml @@ -6,3 +6,4 @@ channels: dependencies: - seacr - ucsc-bigwigtobedgraph + - bedtools diff --git a/snakePipes/shared/rules/multiQC.snakefile b/snakePipes/shared/rules/multiQC.snakefile index 619a05398..efc2df5b0 100755 --- a/snakePipes/shared/rules/multiQC.snakefile +++ b/snakePipes/shared/rules/multiQC.snakefile @@ -61,12 +61,12 @@ def multiqc_input_check(return_value): infiles.append( expand("Qualimap_qc/{sample}.filtered.bamqc_results.txt", sample = samples) ) indir += " Qualimap_qc " if "allelic-mapping" in mode: - infiles.append( expand("allelic_bams/{sample}.SNPsplit_report.yaml", sample = samples) ) - infiles.append( expand("allelic_bams/{sample}.SNPsplit_sort.yaml", sample = samples) ) + infiles.append( expand("allelic_bams/{sample}.filtered.SNPsplit_report.yaml", sample = samples) ) + infiles.append( expand("allelic_bams/{sample}.filtered.SNPsplit_sort.yaml", sample = samples) ) indir += "allelic_bams" elif pipeline=="rna-seq": # must be RNA-mapping, add files as per the mode - if ( "alignment" in mode or "deepTools_qc" in mode or "three-prime-seq" in mode ) and not "allelic-mapping" in mode: + if ( "alignment" in mode or "deepTools_qc" in mode or "three-prime-seq" in mode ) and not "allelic-mapping" in mode and not "allelic-counting" in mode: infiles.append( expand(aligner+"/{sample}.bam", sample = samples) + expand("Sambamba/{sample}.markdup.txt", sample = samples) + expand("deepTools_qc/estimateReadFiltering/{sample}_filtering_estimation.txt",sample=samples)+ @@ -74,9 +74,10 @@ def multiqc_input_check(return_value): indir += aligner + " featureCounts " indir += " Sambamba " indir += " deepTools_qc " - if "allelic-mapping" in mode: + if "allelic-mapping" in mode or "allelic-counting" in mode: infiles.append( expand("featureCounts/{sample}.allelic_counts.txt", sample = samples) ) indir += aligner + " featureCounts " + if "allelic-mapping" in mode: infiles.append( expand("allelic_bams/{sample}.SNPsplit_report.yaml", sample = samples) ) infiles.append( expand("allelic_bams/{sample}.SNPsplit_sort.yaml", sample = samples) ) indir += "allelic_bams" diff --git a/snakePipes/workflows/ATAC-seq/ATAC-seq b/snakePipes/workflows/ATAC-seq/ATAC-seq index 25e0d37f2..386f8b457 100755 --- a/snakePipes/workflows/ATAC-seq/ATAC-seq +++ b/snakePipes/workflows/ATAC-seq/ATAC-seq @@ -20,6 +20,7 @@ def parse_args(defaults={"verbose": False, "configFile": None, "reads": ["_R1", "_R2"], "maxFragmentSize": 150, "minFragmentSize": 0, "fromBAM": False ,"qval": 0.001 ,"sampleSheet": None, + "externalBed": None, "bamExt": "filtered.bam", "fdr": 0.05, "absBestLFC": 1, "UMIDedup": False, "UMIDedupOpts": "", "UMIDedupSep": "_", @@ -71,6 +72,10 @@ def parse_args(defaults={"verbose": False, "configFile": None, help="Invoke differential accessibility analysis by providing information on samples; see 'https://github.com/maxplanck-ie/snakepipes/tree/master/docs/content/sampleSheet.example.tsv' for example. IMPORTANT: The first entry defines which group of samples are control. With this, the order of comparison and likewise the sign of values can be changed! Also, the condition `control` should not be used (reserved to mark input samples in the ChIP-Seq workflow (default: '%(default)s').", default=defaults["sampleSheet"]) + optional.add_argument("--externalBed", + help="A bed file with intervals to be tested for differential binding. (default: '%(default)s')", + default=defaults["externalBed"]) + optional.add_argument("--fromBAM", dest="fromBAM", help="Input folder with bam files. If provided, the analysis will start from this point. (default: '%(default)s')", diff --git a/snakePipes/workflows/ATAC-seq/defaults.yaml b/snakePipes/workflows/ATAC-seq/defaults.yaml index 6bc3d2f65..64e332020 100644 --- a/snakePipes/workflows/ATAC-seq/defaults.yaml +++ b/snakePipes/workflows/ATAC-seq/defaults.yaml @@ -29,6 +29,7 @@ minFragmentSize: 0 verbose: false # sampleSheet_DB sampleSheet: +externalBed: # windowSize windowSize: 20 fragmentCountThreshold: 1 diff --git a/snakePipes/workflows/ATAC-seq/internals.snakefile b/snakePipes/workflows/ATAC-seq/internals.snakefile index 0908411b0..b26325039 100644 --- a/snakePipes/workflows/ATAC-seq/internals.snakefile +++ b/snakePipes/workflows/ATAC-seq/internals.snakefile @@ -84,3 +84,9 @@ if sampleSheet: reordered_dict[g] = {k: filtered_dict[k] for k in [item for sublist in genrichDict[g].values() for item in sublist]} else: genrichDict = {"all_samples": samples} + +if externalBed: + if os.path.isfile(externalBed): + peakCaller = os.path.splitext(os.path.basename(externalBed))[0] + else: + warnings.warn("{} file not found.".format(externalBed)) diff --git a/snakePipes/workflows/ChIP-seq/ChIP-seq b/snakePipes/workflows/ChIP-seq/ChIP-seq index 50d242a71..3e0d0d020 100755 --- a/snakePipes/workflows/ChIP-seq/ChIP-seq +++ b/snakePipes/workflows/ChIP-seq/ChIP-seq @@ -21,6 +21,7 @@ def parse_args(defaults={"verbose": False, "configFile": None, "pairedEnd": True, "fragmentLength":200, "bwBinSize": 25, "windowSize": 150, "predictChIPDict": None, "fromBAM": False, "bigWigType": "both", "peakCaller": "MACS2", "sampleSheet": "", + "externalBed": "", "plotFormat": "png", "bamExt": ".filtered.bam", "fdr": 0.05, "absBestLFC": 1, "useSpikeinForNorm": False, "spikeinExt": "_spikein", "peakCallerOptions": "--qvalue 0.001","cutntag": False, @@ -108,6 +109,10 @@ def parse_args(defaults={"verbose": False, "configFile": None, "are not evaluated for differential binding) (default: '%(default)s')", default=defaults["sampleSheet"]) + optional.add_argument("--externalBed", + help="A bed file with intervals to be tested for differential binding. (default: '%(default)s')", + default=defaults["externalBed"]) + optional.add_argument("--windowSize", help="Window size to counts reads in (If differential binding analysis required); " "Default size is suitable for most transcription factors and sharp histone marks. " diff --git a/snakePipes/workflows/ChIP-seq/Snakefile b/snakePipes/workflows/ChIP-seq/Snakefile index 6a5cfed45..213e65189 100755 --- a/snakePipes/workflows/ChIP-seq/Snakefile +++ b/snakePipes/workflows/ChIP-seq/Snakefile @@ -195,7 +195,7 @@ def run_genrich(): def run_macs2(): - if (peakCaller == "MACS2") or cutntag: + if peakCaller == "MACS2": if useSpikeInForNorm: file_list = expand("MACS2/{chip_sample}_host.BAM_peaks.xls", chip_sample = chip_samples) file_list.extend(expand("MACS2/{chip_sample}_host.BAM_peaks.qc.txt", chip_sample = chip_samples)) @@ -210,8 +210,10 @@ def run_seacr(): if (peakCaller == "SEACR"): if useSpikeInForNorm: file_list = expand("SEACR/{chip_sample}_host.stringent.bed", chip_sample = chip_samples) + file_list.append(expand("SEACR/{chip_sample}_host.stringent_peaks.qc.txt",chip_sample=chip_samples)) else: file_list = expand("SEACR/{chip_sample}.filtered.stringent.bed", chip_sample = chip_samples) + file_list.append(expand("SEACR/{chip_sample}.filtered.stringent_peaks.qc.txt",chip_sample=chip_samples)) return (file_list) return ([]) diff --git a/snakePipes/workflows/ChIP-seq/defaults.yaml b/snakePipes/workflows/ChIP-seq/defaults.yaml index 1d8f1fcd7..3608e0a67 100755 --- a/snakePipes/workflows/ChIP-seq/defaults.yaml +++ b/snakePipes/workflows/ChIP-seq/defaults.yaml @@ -44,6 +44,8 @@ fragmentLength: 200 verbose: false # sampleSheet_DB sampleSheet: +# externalBed for DB +externalBed: # windowSize windowSize: 150 #### Flag to control the pipeline entry point diff --git a/snakePipes/workflows/ChIP-seq/internals.snakefile b/snakePipes/workflows/ChIP-seq/internals.snakefile index ac95b5726..56877b89a 100755 --- a/snakePipes/workflows/ChIP-seq/internals.snakefile +++ b/snakePipes/workflows/ChIP-seq/internals.snakefile @@ -276,3 +276,9 @@ if useSpikeInForNorm: else: print("\n No spikein genome detected - no spikeIn chromosomes found with extention " + spikeinExt + " .\n\n") exit(1) + +if externalBed: + if os.path.isfile(externalBed): + peakCaller = os.path.splitext(os.path.basename(externalBed))[0] + else: + warnings.warn("{} file not found.".format(externalBed)) diff --git a/snakePipes/workflows/mRNA-seq/Snakefile b/snakePipes/workflows/mRNA-seq/Snakefile index 9d2c968f9..c4c97a8a9 100755 --- a/snakePipes/workflows/mRNA-seq/Snakefile +++ b/snakePipes/workflows/mRNA-seq/Snakefile @@ -27,8 +27,9 @@ include: os.path.join(maindir, "shared", "tools" , "deeptools_cmds.snakefile") ## filtered annotation (GTF) include: os.path.join(maindir, "shared", "rules", "filterGTF.snakefile") -## bamCoverage_RPKM -include: os.path.join(maindir, "shared", "rules", "deepTools_RNA.snakefile") +if not "allelic-counting" in mode: + ## bamCoverage_RPKM + include: os.path.join(maindir, "shared", "rules", "deepTools_RNA.snakefile") if not fromBAM: @@ -100,11 +101,14 @@ else: include: os.path.join(maindir, "shared", "rules", "SNPsplit.snakefile") # deepTools QC include: os.path.join(maindir, "shared", "rules", "deepTools_RNA_allelic.snakefile") + elif "allelic-counting" in mode: + allele_mode = "from_split_bam" + include: os.path.join(maindir, "shared", "rules", "deepTools_RNA_allelic.snakefile") include: os.path.join(maindir, "shared", "rules", "LinkBam.snakefile") -if "allelic-mapping" in mode: +if "allelic-mapping" in mode or "allelic-counting" in mode: ## featureCounts_allelic include: os.path.join(maindir, "shared", "rules", "featureCounts_allelic.snakefile") else: @@ -123,7 +127,7 @@ include:os.path.join(maindir, "shared", "rules", "RNA-seq_qc_report.snakefile") ## DESeq2 if sampleSheet and not "three-prime-seq" in mode: - if isMultipleComparison and not "allelic-mapping" in mode: + if isMultipleComparison and not "allelic-mapping" in mode and not "allelic-counting" in mode: include: os.path.join(maindir, "shared", "rules", "DESeq2.multipleComp.snakefile") if rMats: include: os.path.join(maindir, "shared", "rules", "rMats.multipleComp.snakefile") @@ -239,7 +243,7 @@ def make_nmasked_genome(): return([]) def run_allelesp_mapping(): - if "allelic-mapping" in mode: + if "allelic-mapping" in mode or "allelic-counting" in mode: allele_suffix = ['allele_flagged', 'genome1', 'genome2', 'unassigned'] file_list = [ expand("allelic_bams/{sample}.{suffix}.sorted.bam", sample = samples, diff --git a/snakePipes/workflows/mRNA-seq/defaults.yaml b/snakePipes/workflows/mRNA-seq/defaults.yaml index 849ffd722..f3142c8bf 100644 --- a/snakePipes/workflows/mRNA-seq/defaults.yaml +++ b/snakePipes/workflows/mRNA-seq/defaults.yaml @@ -64,6 +64,7 @@ clusterPAS: ## further options mode: alignment,deepTools_qc sampleSheet: +formula: rMats: False bwBinSize: 25 fastqc: False diff --git a/snakePipes/workflows/mRNA-seq/internals.snakefile b/snakePipes/workflows/mRNA-seq/internals.snakefile index 835802a80..93470fd77 100644 --- a/snakePipes/workflows/mRNA-seq/internals.snakefile +++ b/snakePipes/workflows/mRNA-seq/internals.snakefile @@ -37,7 +37,14 @@ if not fromBAM: pairedEnd = cf.is_paired(infiles, ext, reads) else: infiles = sorted(glob.glob(os.path.join(str(indir or ''), '*' + bamExt))) - samples = cf.get_sample_names_bam(infiles, bamExt) + if "allelic-counting" in mode: + samples = cf.get_sample_names_suffix_bam(infiles, bamExt) + else: + samples = cf.get_sample_names_bam(infiles, bamExt) + +if formula and not sampleSheet: + print("In order to apply custom formula, please provide a sample sheet!") + exit(1) if sampleSheet: cf.check_sample_info_header(sampleSheet) diff --git a/snakePipes/workflows/mRNA-seq/mRNA-seq b/snakePipes/workflows/mRNA-seq/mRNA-seq index d06eb9388..a02f4d012 100755 --- a/snakePipes/workflows/mRNA-seq/mRNA-seq +++ b/snakePipes/workflows/mRNA-seq/mRNA-seq @@ -14,6 +14,7 @@ import sys import textwrap import snakePipes.common_functions as cf import snakePipes.parserCommon as parserCommon +import warnings def parse_args(defaults={"verbose": False, "configFile": None, @@ -26,6 +27,7 @@ def parse_args(defaults={"verbose": False, "configFile": None, "alignerOptions": None, "featureCountsOptions": "-C -Q 10 --primary", "filterGTF": None, "sampleSheet": None, + "formula": None, "reads": ["_R1", "_R2"], "ext": ".fastq.gz", "bwBinSize": 25, "dnaContam": False, "plotFormat": "png", "fromBAM": False, "bamExt": ".bam", "pairedEnd": True, @@ -50,7 +52,7 @@ def parse_args(defaults={"verbose": False, "configFile": None, # Workflow options optional = parser.add_argument_group('Options') optional.add_argument("-m", "--mode", - help="workflow running modes (available: 'alignment-free, alignment, allelic-mapping, deepTools_qc, three-prime-seq')" + help="workflow running modes (available: 'alignment-free, alignment, allelic-mapping, allelic-counting, deepTools_qc, three-prime-seq')" " (default: '%(default)s')", default=defaults["mode"]) @@ -91,6 +93,12 @@ def parse_args(defaults={"verbose": False, "configFile": None, " for that! (default: '%(default)s')", default=defaults["sampleSheet"]) + optional.add_argument("--formula", + dest="formula", + help="Design formula to use in linear model fit (default: '%(default)s')", + default=defaults["formula"]) + + optional.add_argument("--dnaContam", action="store_true", help="Returns a plot which presents the proportion of the intergenic reads (default: '%(default)s')", @@ -152,16 +160,24 @@ def main(): args.SNPfile = os.path.abspath(args.SNPfile) args.NMaskedIndex = os.path.abspath(args.NMaskedIndex) modeTemp = args.mode.split(",") - validModes = set(["alignment", "alignment-free", "deepTools_qc", "allelic-mapping","three-prime-seq"]) + validModes = set(["alignment", "alignment-free", "deepTools_qc", "allelic-mapping", "allelic-counting", "three-prime-seq"]) for mode in modeTemp: if mode not in validModes: sys.exit("{} is not a valid mode!\n".format(mode)) if "alignment" not in modeTemp and args.UMIDedup: sys.exit("UMIDedup is only valid for \"alignment\" mode!\n") + if "allelic-counting" in modeTemp and "deepTools_qc" in modeTemp: + sys.exit("Mode deepTools_qc is not compatible with mode allelic-counting.") if args.fromBAM and ("alignment-free" in modeTemp ): - sys.exit("\n--fromBAM can only be used with modes \'alignment\' , \'allelic-mapping\' or \'deepTools_qc\' - use one of these modes or provide fastq files!\n") + sys.exit("\n--fromBAM can only be used with modes \'alignment\' , \'allelic-mapping\' , \'allelic-counting\' or \'deepTools_qc\' - use one of these modes or provide fastq files!\n") if args.fromBAM: args.aligner = "EXTERNAL_BAM" + if "allelic-counting" in modeTemp and not args.fromBAM: + warnings.warn("--fromBAM is required with allelic-counting mode. Setting to True.") + args.fromBAM = True + if "allelic-counting" in modeTemp: + args.bamExt = ".sorted.bam" + args.aligner = "allelic_bams" if args.rMats and not args.sampleSheet: sys.exit("--rMats flag requires a sampleSheet (specified with --sampleSheet).\n") if "three_prime_seq" in mode: diff --git a/tests/test_jobcounts.py b/tests/test_jobcounts.py index 67f429cb3..242eb3951 100644 --- a/tests/test_jobcounts.py +++ b/tests/test_jobcounts.py @@ -585,7 +585,7 @@ def test_seacr(self, ifs): print(' '.join([str(i) for i in ci])) _p = sp.run(ci, capture_output=True, text=True) assert _p.returncode == 0 - assert parseSpOut(_p) == 71 + assert parseSpOut(_p) == 77 def test_seacr_spikein(self, ifs): ci = [ "ChIP-seq", @@ -604,7 +604,7 @@ def test_seacr_spikein(self, ifs): print(' '.join([str(i) for i in ci])) _p = sp.run(ci, capture_output=True, text=True) assert _p.returncode == 0 - assert parseSpOut(_p) == 112 + assert parseSpOut(_p) == 118 def test_SE(self, ifs): ci = [ "ChIP-seq", @@ -691,7 +691,7 @@ def test_seacr_noInput(self, ifs): print(' '.join([str(i) for i in ci])) _p = sp.run(ci, capture_output=True, text=True) assert _p.returncode == 0 - assert parseSpOut(_p) == 44 + assert parseSpOut(_p) == 50 def test_seacr_spikein_noInput(self, ifs): ci = [ "ChIP-seq", @@ -710,7 +710,7 @@ def test_seacr_spikein_noInput(self, ifs): print(' '.join([str(i) for i in ci])) _p = sp.run(ci, capture_output=True, text=True) assert _p.returncode == 0 - assert parseSpOut(_p) == 65 + assert parseSpOut(_p) == 71 def test_frombam(self, ifs): ci = [ "ChIP-seq",