From 1102c13e2ca835a0ed0b9ed5ee3d1e3c53cdb6ca Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 20 Feb 2024 11:51:27 +0100 Subject: [PATCH 01/66] mRNAseq --- snakePipes/workflows/mRNA-seq/defaults.yaml | 2 ++ snakePipes/workflows/mRNA-seq/mRNA-seq | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/snakePipes/workflows/mRNA-seq/defaults.yaml b/snakePipes/workflows/mRNA-seq/defaults.yaml index 849ffd722..2a06d5520 100644 --- a/snakePipes/workflows/mRNA-seq/defaults.yaml +++ b/snakePipes/workflows/mRNA-seq/defaults.yaml @@ -64,6 +64,8 @@ clusterPAS: ## further options mode: alignment,deepTools_qc sampleSheet: +formula: +contrast: rMats: False bwBinSize: 25 fastqc: False diff --git a/snakePipes/workflows/mRNA-seq/mRNA-seq b/snakePipes/workflows/mRNA-seq/mRNA-seq index d06eb9388..a8f59a005 100755 --- a/snakePipes/workflows/mRNA-seq/mRNA-seq +++ b/snakePipes/workflows/mRNA-seq/mRNA-seq @@ -26,6 +26,7 @@ def parse_args(defaults={"verbose": False, "configFile": None, "alignerOptions": None, "featureCountsOptions": "-C -Q 10 --primary", "filterGTF": None, "sampleSheet": None, + "formula": None. "contrast": None, "reads": ["_R1", "_R2"], "ext": ".fastq.gz", "bwBinSize": 25, "dnaContam": False, "plotFormat": "png", "fromBAM": False, "bamExt": ".bam", "pairedEnd": True, @@ -91,6 +92,22 @@ def parse_args(defaults={"verbose": False, "configFile": None, " for that! (default: '%(default)s')", default=defaults["sampleSheet"]) + optional.add_argument("--formula", + dest="formula", + metavar="STR", + action='store', + type=str, + help="Design formula to use in linear model fit (default: '%(default)s')", + default=defaults["formula"]) + + optional.add_argument("--contrast", + dest="contrast", + metavar="STR", + action='store', + type=str, + help="Contrast to extract the coefficients for (default: '%(default)s')", + default=defaults["contrast"]) + optional.add_argument("--dnaContam", action="store_true", help="Returns a plot which presents the proportion of the intergenic reads (default: '%(default)s')", From 53ba7876cf1e2b014e83a0c89a1dc9a6ad75e4a4 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 20 Feb 2024 11:53:34 +0100 Subject: [PATCH 02/66] internals --- snakePipes/workflows/mRNA-seq/internals.snakefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/snakePipes/workflows/mRNA-seq/internals.snakefile b/snakePipes/workflows/mRNA-seq/internals.snakefile index 835802a80..a69b0e580 100644 --- a/snakePipes/workflows/mRNA-seq/internals.snakefile +++ b/snakePipes/workflows/mRNA-seq/internals.snakefile @@ -39,6 +39,10 @@ else: infiles = sorted(glob.glob(os.path.join(str(indir or ''), '*' + bamExt))) samples = cf.get_sample_names_bam(infiles, bamExt) +if formula and not sampleSheet or contrast and not sampleSheet: + print("In order to apply custom formula or contrast, please provide a sample sheet!") + exit(1) + if sampleSheet: cf.check_sample_info_header(sampleSheet) isMultipleComparison = cf.isMultipleComparison(sampleSheet) From ab54fc5364fd7ff1d0ce5bda7dffc1854ee84454 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 20 Feb 2024 11:58:47 +0100 Subject: [PATCH 03/66] removed contrast --- snakePipes/shared/rules/DESeq2.singleComp.snakefile | 5 ++++- snakePipes/workflows/mRNA-seq/defaults.yaml | 1 - snakePipes/workflows/mRNA-seq/internals.snakefile | 4 ++-- snakePipes/workflows/mRNA-seq/mRNA-seq | 9 +-------- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile index c6822e41b..f03bb344f 100644 --- a/snakePipes/shared/rules/DESeq2.singleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.singleComp.snakefile @@ -21,7 +21,9 @@ rule DESeq2: importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE', tx2gene_file = 'NA', - rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd") + rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), + formula = formula, + contrast = contrast log: out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",sampleSheet)), err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",sampleSheet)) @@ -37,6 +39,7 @@ rule DESeq2: "{params.allele_info} " # 6 "{params.tx2gene_file} " # 7 "{params.rmdTemplate} " # 8 + "{params.formula}" #9 " > ../{log.out} 2> ../{log.err}" diff --git a/snakePipes/workflows/mRNA-seq/defaults.yaml b/snakePipes/workflows/mRNA-seq/defaults.yaml index 2a06d5520..f3142c8bf 100644 --- a/snakePipes/workflows/mRNA-seq/defaults.yaml +++ b/snakePipes/workflows/mRNA-seq/defaults.yaml @@ -65,7 +65,6 @@ clusterPAS: mode: alignment,deepTools_qc sampleSheet: formula: -contrast: rMats: False bwBinSize: 25 fastqc: False diff --git a/snakePipes/workflows/mRNA-seq/internals.snakefile b/snakePipes/workflows/mRNA-seq/internals.snakefile index a69b0e580..f9e3c6ba8 100644 --- a/snakePipes/workflows/mRNA-seq/internals.snakefile +++ b/snakePipes/workflows/mRNA-seq/internals.snakefile @@ -39,8 +39,8 @@ else: infiles = sorted(glob.glob(os.path.join(str(indir or ''), '*' + bamExt))) samples = cf.get_sample_names_bam(infiles, bamExt) -if formula and not sampleSheet or contrast and not sampleSheet: - print("In order to apply custom formula or contrast, please provide a sample sheet!") +if formula and not sampleSheet: + print("In order to apply custom formula, please provide a sample sheet!") exit(1) if sampleSheet: diff --git a/snakePipes/workflows/mRNA-seq/mRNA-seq b/snakePipes/workflows/mRNA-seq/mRNA-seq index a8f59a005..dd8c44a40 100755 --- a/snakePipes/workflows/mRNA-seq/mRNA-seq +++ b/snakePipes/workflows/mRNA-seq/mRNA-seq @@ -26,7 +26,7 @@ def parse_args(defaults={"verbose": False, "configFile": None, "alignerOptions": None, "featureCountsOptions": "-C -Q 10 --primary", "filterGTF": None, "sampleSheet": None, - "formula": None. "contrast": None, + "formula": None, "reads": ["_R1", "_R2"], "ext": ".fastq.gz", "bwBinSize": 25, "dnaContam": False, "plotFormat": "png", "fromBAM": False, "bamExt": ".bam", "pairedEnd": True, @@ -100,13 +100,6 @@ def parse_args(defaults={"verbose": False, "configFile": None, help="Design formula to use in linear model fit (default: '%(default)s')", default=defaults["formula"]) - optional.add_argument("--contrast", - dest="contrast", - metavar="STR", - action='store', - type=str, - help="Contrast to extract the coefficients for (default: '%(default)s')", - default=defaults["contrast"]) optional.add_argument("--dnaContam", action="store_true", From a07b534f964864635f3b6ade778e8e4d2ccf8679 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 20 Feb 2024 12:08:44 +0100 Subject: [PATCH 04/66] rules and rscript --- snakePipes/shared/rscripts/DESeq2.R | 6 +++++- snakePipes/shared/rscripts/DE_functions.R | 4 ++-- snakePipes/shared/rules/DESeq2.multipleComp.snakefile | 8 ++++++-- snakePipes/shared/rules/DESeq2.singleComp.snakefile | 7 ++++--- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/snakePipes/shared/rscripts/DESeq2.R b/snakePipes/shared/rscripts/DESeq2.R index 4f76297b1..45ecae95f 100755 --- a/snakePipes/shared/rscripts/DESeq2.R +++ b/snakePipes/shared/rscripts/DESeq2.R @@ -9,6 +9,7 @@ # args 5 : path to DE_functions # args 6 : T/F whether or not the workflow is allele-sepecific # args 7 : tx2gene file for salmon --> DESeq mode +# args 9: model formula .libPaths(R.home("library")) @@ -31,6 +32,9 @@ if(file.exists(tx2gene_file)) { } rmdTemplate <- args[8] + +formulaInput <- args[9] + topN <- 50 ## include functions suppressPackageStartupMessages(library(ggplot2)) @@ -96,7 +100,7 @@ if(length(unique(sampleInfo$condition))>1){ if(tximport & allelic_info){ message("Detected allelic Salmon counts. Skipping DESeq_basic.") }else{ - seqout <- DESeq_basic(countdata, coldata = sampleInfo, fdr = fdr, alleleSpecific = allelic_info, from_salmon = tximport) + seqout <- DESeq_basic(countdata, coldata = sampleInfo, fdr = fdr, alleleSpecific = allelic_info, from_salmon = tximport, custom_formula = formulaInput) DESeq_writeOutput(DEseqout = seqout, fdr = fdr, outprefix = "DEseq_basic", diff --git a/snakePipes/shared/rscripts/DE_functions.R b/snakePipes/shared/rscripts/DE_functions.R index de55cab74..b644f15a5 100644 --- a/snakePipes/shared/rscripts/DE_functions.R +++ b/snakePipes/shared/rscripts/DE_functions.R @@ -92,9 +92,9 @@ checktable <- function(countdata = NA, sampleSheet = NA, alleleSpecific = FALSE, #' #' -DESeq_basic <- function(countdata, coldata, fdr, alleleSpecific = FALSE, from_salmon = FALSE, size_factors=NA) { +DESeq_basic <- function(countdata, coldata, fdr, alleleSpecific = FALSE, from_salmon = FALSE, size_factors=NA, customFormula=NA) { cnames.sub<-unique(colnames(coldata)[2:which(colnames(coldata) %in% "condition")]) - d<-as.formula(noquote(paste0("~",paste(cnames.sub,collapse="+")))) + d<-ifelse(is.na(customFormula),as.formula(noquote(paste0("~",paste(cnames.sub,collapse="+"))),d<-as.formula(customFormula)) # Normal DESeq diff --git a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile index f60a60182..5a84ce904 100644 --- a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile @@ -34,7 +34,8 @@ rule DESeq2: importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE', tx2gene_file = 'NA', - rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd") + rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), + formula = formula log: out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")), err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")) @@ -50,6 +51,7 @@ rule DESeq2: "{params.allele_info} " # 6 "{params.tx2gene_file} " # 7 "{params.rmdTemplate} " # 8 + "{params.formula}" # 9 " > ../{log.out} 2> ../{log.err}" @@ -75,7 +77,8 @@ rule DESeq2_Salmon_basic: importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), allele_info = 'FALSE', tx2gene_file = "Annotation/genes.filtered.t2g", - rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd") + rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), + formula = formula conda: CONDA_RNASEQ_ENV shell: "cd {params.outdir} && " @@ -88,4 +91,5 @@ rule DESeq2_Salmon_basic: "{params.allele_info} " # 6 "../{input.tx2gene_file} " # 7 "{params.rmdTemplate} " # 8 + "{params.formula}" " > ../{log.out} 2> ../{log.err}" diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile index f03bb344f..6d892abb6 100644 --- a/snakePipes/shared/rules/DESeq2.singleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.singleComp.snakefile @@ -22,8 +22,7 @@ rule DESeq2: allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE', tx2gene_file = 'NA', rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), - formula = formula, - contrast = contrast + formula = formula log: out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",sampleSheet)), err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",sampleSheet)) @@ -64,7 +63,8 @@ rule DESeq2_Salmon_basic: importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), allele_info = 'FALSE', tx2gene_file = "Annotation/genes.filtered.t2g", - rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd") + rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd")a, + formula = formula conda: CONDA_RNASEQ_ENV shell: "cd {params.outdir} && " @@ -77,6 +77,7 @@ rule DESeq2_Salmon_basic: "{params.allele_info} " # 6 "../{input.tx2gene_file} " # 7 "{params.rmdTemplate} " # 8 + "{params.formula}" # 9 " > ../{log.out} 2> ../{log.err}" rule DESeq2_Salmon_allelic: From b9f0aaef8ddca308d7e87001b09e13e3814fb971 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 20 Feb 2024 14:36:39 +0100 Subject: [PATCH 05/66] fix coef for apeglm --- snakePipes/shared/rscripts/DE_functions.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/snakePipes/shared/rscripts/DE_functions.R b/snakePipes/shared/rscripts/DE_functions.R index b644f15a5..232a76309 100644 --- a/snakePipes/shared/rscripts/DE_functions.R +++ b/snakePipes/shared/rscripts/DE_functions.R @@ -123,9 +123,8 @@ DESeq_basic <- function(countdata, coldata, fdr, alleleSpecific = FALSE, from_sa } dds <- DESeq2::DESeq(dds) ddr <- DESeq2::results(dds, alpha = fdr) - c1<-unique(coldata$condition)[1] - c2<-unique(coldata$condition)[2] - ddr_shrunk <- DESeq2::lfcShrink(dds,coef=paste0("condition_",c2,"_vs_",c1),type="apeglm",res=ddr) + auto_coef<-resultsNames(dds)[length(resultsNames(dds))] + ddr_shrunk <- DESeq2::lfcShrink(dds,coef=auto_coef,type="apeglm",res=ddr) output <- list(dds = dds, ddr = ddr, ddr_shrunk = ddr_shrunk) return(output) From c9af642dc26d753f04f293f3158bdbf430137632 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 21 Feb 2024 12:15:55 +0100 Subject: [PATCH 06/66] fix de fun --- snakePipes/shared/rscripts/DE_functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/shared/rscripts/DE_functions.R b/snakePipes/shared/rscripts/DE_functions.R index 232a76309..d020f975a 100644 --- a/snakePipes/shared/rscripts/DE_functions.R +++ b/snakePipes/shared/rscripts/DE_functions.R @@ -94,7 +94,7 @@ checktable <- function(countdata = NA, sampleSheet = NA, alleleSpecific = FALSE, DESeq_basic <- function(countdata, coldata, fdr, alleleSpecific = FALSE, from_salmon = FALSE, size_factors=NA, customFormula=NA) { cnames.sub<-unique(colnames(coldata)[2:which(colnames(coldata) %in% "condition")]) - d<-ifelse(is.na(customFormula),as.formula(noquote(paste0("~",paste(cnames.sub,collapse="+"))),d<-as.formula(customFormula)) + d<-ifelse(is.na(customFormula),as.formula(noquote(paste0("~",paste(cnames.sub,collapse="+")))),as.formula(customFormula)) # Normal DESeq From 3f188272f5ea735866dd70fcb72dd0317eea8d65 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 21 Feb 2024 12:27:30 +0100 Subject: [PATCH 07/66] fix deseq2 --- snakePipes/shared/rscripts/DESeq2.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/shared/rscripts/DESeq2.R b/snakePipes/shared/rscripts/DESeq2.R index 45ecae95f..4757e44a4 100755 --- a/snakePipes/shared/rscripts/DESeq2.R +++ b/snakePipes/shared/rscripts/DESeq2.R @@ -100,7 +100,7 @@ if(length(unique(sampleInfo$condition))>1){ if(tximport & allelic_info){ message("Detected allelic Salmon counts. Skipping DESeq_basic.") }else{ - seqout <- DESeq_basic(countdata, coldata = sampleInfo, fdr = fdr, alleleSpecific = allelic_info, from_salmon = tximport, custom_formula = formulaInput) + seqout <- DESeq_basic(countdata, coldata = sampleInfo, fdr = fdr, alleleSpecific = allelic_info, from_salmon = tximport, customFormula = formulaInput) DESeq_writeOutput(DEseqout = seqout, fdr = fdr, outprefix = "DEseq_basic", From 5ae2128a319b7a09f0f80afcf0a094ccaca7205a Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 21 Feb 2024 12:36:01 +0100 Subject: [PATCH 08/66] test --- snakePipes/shared/rscripts/DESeq2.R | 1 + snakePipes/shared/rules/DESeq2.singleComp.snakefile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/snakePipes/shared/rscripts/DESeq2.R b/snakePipes/shared/rscripts/DESeq2.R index 4757e44a4..a51d07504 100755 --- a/snakePipes/shared/rscripts/DESeq2.R +++ b/snakePipes/shared/rscripts/DESeq2.R @@ -53,6 +53,7 @@ cat(paste("Working dir:", getwd(), "\n")) cat(paste("Sample info CSV:", sampleInfoFilePath, "\n")) cat(paste("Count file:", countFilePath, "\n")) cat(paste("FDR:", fdr, "\n")) +cat(paste("Custom formula:",formulaInput,"\n")) cat(paste("Gene names:", geneNamesFilePath, "\n")) cat(paste("Number of top N genes:", topN, "\n")) cat(paste("Salmon --> DESeq2 : ", tximport, "\n")) diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile index 6d892abb6..658217aa1 100644 --- a/snakePipes/shared/rules/DESeq2.singleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.singleComp.snakefile @@ -63,7 +63,7 @@ rule DESeq2_Salmon_basic: importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), allele_info = 'FALSE', tx2gene_file = "Annotation/genes.filtered.t2g", - rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd")a, + rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), formula = formula conda: CONDA_RNASEQ_ENV shell: From 64405ea5e46c6271fa9c758f142401277eeb2393 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 21 Feb 2024 12:44:07 +0100 Subject: [PATCH 09/66] add tilda on the fly --- snakePipes/shared/rscripts/DE_functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/shared/rscripts/DE_functions.R b/snakePipes/shared/rscripts/DE_functions.R index d020f975a..ca2fceaa4 100644 --- a/snakePipes/shared/rscripts/DE_functions.R +++ b/snakePipes/shared/rscripts/DE_functions.R @@ -94,7 +94,7 @@ checktable <- function(countdata = NA, sampleSheet = NA, alleleSpecific = FALSE, DESeq_basic <- function(countdata, coldata, fdr, alleleSpecific = FALSE, from_salmon = FALSE, size_factors=NA, customFormula=NA) { cnames.sub<-unique(colnames(coldata)[2:which(colnames(coldata) %in% "condition")]) - d<-ifelse(is.na(customFormula),as.formula(noquote(paste0("~",paste(cnames.sub,collapse="+")))),as.formula(customFormula)) + d<-ifelse(is.na(customFormula),as.formula(noquote(paste0("~",paste(cnames.sub,collapse="+")))),as.formula(paste0("~",customFormula))) # Normal DESeq From ee7a35c71fea31b355b21da21d8afea6f8dd1441 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 21 Feb 2024 13:05:00 +0100 Subject: [PATCH 10/66] minimal changes --- snakePipes/shared/rules/DESeq2.multipleComp.snakefile | 4 ++-- snakePipes/shared/rules/DESeq2.singleComp.snakefile | 4 ++-- snakePipes/workflows/mRNA-seq/mRNA-seq | 3 --- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile index 5a84ce904..3abc58845 100644 --- a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile @@ -51,7 +51,7 @@ rule DESeq2: "{params.allele_info} " # 6 "{params.tx2gene_file} " # 7 "{params.rmdTemplate} " # 8 - "{params.formula}" # 9 + "{params.formula} " # 9 " > ../{log.out} 2> ../{log.err}" @@ -91,5 +91,5 @@ rule DESeq2_Salmon_basic: "{params.allele_info} " # 6 "../{input.tx2gene_file} " # 7 "{params.rmdTemplate} " # 8 - "{params.formula}" + "{params.formula} " " > ../{log.out} 2> ../{log.err}" diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile index 658217aa1..0c4a4e699 100644 --- a/snakePipes/shared/rules/DESeq2.singleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.singleComp.snakefile @@ -38,7 +38,7 @@ rule DESeq2: "{params.allele_info} " # 6 "{params.tx2gene_file} " # 7 "{params.rmdTemplate} " # 8 - "{params.formula}" #9 + "{params.formula} " #9 " > ../{log.out} 2> ../{log.err}" @@ -77,7 +77,7 @@ rule DESeq2_Salmon_basic: "{params.allele_info} " # 6 "../{input.tx2gene_file} " # 7 "{params.rmdTemplate} " # 8 - "{params.formula}" # 9 + "{params.formula} " # 9 " > ../{log.out} 2> ../{log.err}" rule DESeq2_Salmon_allelic: diff --git a/snakePipes/workflows/mRNA-seq/mRNA-seq b/snakePipes/workflows/mRNA-seq/mRNA-seq index dd8c44a40..80f369abd 100755 --- a/snakePipes/workflows/mRNA-seq/mRNA-seq +++ b/snakePipes/workflows/mRNA-seq/mRNA-seq @@ -94,9 +94,6 @@ def parse_args(defaults={"verbose": False, "configFile": None, optional.add_argument("--formula", dest="formula", - metavar="STR", - action='store', - type=str, help="Design formula to use in linear model fit (default: '%(default)s')", default=defaults["formula"]) From 3c23666fc0fccb2c484bfd738bc3d983372173c7 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Thu, 22 Feb 2024 10:16:44 +0100 Subject: [PATCH 11/66] wip --- snakePipes/shared/rscripts/DESeq2.R | 31 +++++++++++++------ .../rules/DESeq2.multipleComp.snakefile | 31 ++++++++++--------- .../shared/rules/DESeq2.singleComp.snakefile | 4 +-- 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/snakePipes/shared/rscripts/DESeq2.R b/snakePipes/shared/rscripts/DESeq2.R index a51d07504..576b7d4b6 100755 --- a/snakePipes/shared/rscripts/DESeq2.R +++ b/snakePipes/shared/rscripts/DESeq2.R @@ -13,27 +13,38 @@ .libPaths(R.home("library")) -args = commandArgs(TRUE) +#args = commandArgs(TRUE) ## re invented RNaseq workflow -sampleInfoFilePath <- args[1] -countFilePath <- args[2] -fdr <- as.numeric(args[3]) -geneNamesFilePath <- args[4] -importfunc <- args[5] -allelic_info <- as.logical(args[6]) +#sampleInfoFilePath <- args[1] +#countFilePath <- args[2] +#fdr <- as.numeric(args[3]) +#geneNamesFilePath <- args[4] +#importfunc <- args[5] +#allelic_info <- as.logical(args[6]) ## if output is from salmon then tx2gene file should be present -tx2gene_file <- args[7] +#tx2gene_file <- args[7] + +sampleInfoFilePath <- snakemake@params[["sampleSheet"]] +countFilePath <- snakemake@input[["counts_table"]] +fdr <- as.numeric(snakemake@params[["fdr"]]) +geneNamesFilePath <- snakemake@input[["symbol_file"]] +importfunc <- snakemake@params[["importfunc"]] +allelic_info <- as.logical(snakemake@params[["allele_info"]]) +tx2gene_file <- snakemake@params[["tx2gene_file"]] +rmdTemplate <- snakemake@params[["rmdTemplate"]] +formulaInput <- snakemake@params[["formula"]] + if(file.exists(tx2gene_file)) { tximport <- TRUE } else { tximport <- FALSE } -rmdTemplate <- args[8] +#rmdTemplate <- args[8] -formulaInput <- args[9] +#formulaInput <- args[9] topN <- 50 ## include functions diff --git a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile index 3abc58845..3208a208e 100644 --- a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile @@ -35,24 +35,25 @@ rule DESeq2: allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE', tx2gene_file = 'NA', rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), - formula = formula + formula = config["formula"] log: out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")), err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")) conda: CONDA_RNASEQ_ENV - shell: - "cd {params.outdir} && " - "Rscript {params.script} " - "{params.sampleSheet} " # 1 - "../{input.counts_table} " # 2 - "{params.fdr} " # 3 - "../{input.symbol_file} " # 4 - "{params.importfunc} " # 5 - "{params.allele_info} " # 6 - "{params.tx2gene_file} " # 7 - "{params.rmdTemplate} " # 8 - "{params.formula} " # 9 - " > ../{log.out} 2> ../{log.err}" + script: "{params.script}" + #shell: + # "cd {params.outdir} && " + # "Rscript {params.script} " + # "{params.sampleSheet} " # 1 + # "../{input.counts_table} " # 2 + # "{params.fdr} " # 3 + # "../{input.symbol_file} " # 4 + # "{params.importfunc} " # 5 + # "{params.allele_info} " # 6 + # "{params.tx2gene_file} " # 7 + # "{params.rmdTemplate} " # 8 + # "{params.formula} " # 9 + # " > ../{log.out} 2> ../{log.err}" ## DESeq2 (on Salmon) @@ -78,7 +79,7 @@ rule DESeq2_Salmon_basic: allele_info = 'FALSE', tx2gene_file = "Annotation/genes.filtered.t2g", rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), - formula = formula + formula = config["formula"] conda: CONDA_RNASEQ_ENV shell: "cd {params.outdir} && " diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile index 0c4a4e699..3464a52be 100644 --- a/snakePipes/shared/rules/DESeq2.singleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.singleComp.snakefile @@ -22,7 +22,7 @@ rule DESeq2: allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE', tx2gene_file = 'NA', rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), - formula = formula + formula = config["formula"] log: out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",sampleSheet)), err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",sampleSheet)) @@ -64,7 +64,7 @@ rule DESeq2_Salmon_basic: allele_info = 'FALSE', tx2gene_file = "Annotation/genes.filtered.t2g", rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), - formula = formula + formula = config["formula"] conda: CONDA_RNASEQ_ENV shell: "cd {params.outdir} && " From 4dfc7fe2859a06bc92f52f98a48643b28a9d9139 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Thu, 22 Feb 2024 15:34:40 +0100 Subject: [PATCH 12/66] DESeq2 multicomp working --- snakePipes/shared/rscripts/DESeq2.R | 6 +++++- snakePipes/shared/rscripts/DE_functions.R | 11 +++++++++-- .../shared/rules/DESeq2.multipleComp.snakefile | 17 ++--------------- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/snakePipes/shared/rscripts/DESeq2.R b/snakePipes/shared/rscripts/DESeq2.R index 576b7d4b6..9d1255162 100755 --- a/snakePipes/shared/rscripts/DESeq2.R +++ b/snakePipes/shared/rscripts/DESeq2.R @@ -27,7 +27,7 @@ #tx2gene_file <- args[7] sampleInfoFilePath <- snakemake@params[["sampleSheet"]] -countFilePath <- snakemake@input[["counts_table"]] +countFilePath <- snakemake@params[["counts_table"]] fdr <- as.numeric(snakemake@params[["fdr"]]) geneNamesFilePath <- snakemake@input[["symbol_file"]] importfunc <- snakemake@params[["importfunc"]] @@ -35,6 +35,10 @@ allelic_info <- as.logical(snakemake@params[["allele_info"]]) tx2gene_file <- snakemake@params[["tx2gene_file"]] rmdTemplate <- snakemake@params[["rmdTemplate"]] formulaInput <- snakemake@params[["formula"]] +wdir <- snakemake@params[["outdir"]] + +setwd(wdir) + if(file.exists(tx2gene_file)) { tximport <- TRUE diff --git a/snakePipes/shared/rscripts/DE_functions.R b/snakePipes/shared/rscripts/DE_functions.R index ca2fceaa4..0395465d1 100644 --- a/snakePipes/shared/rscripts/DE_functions.R +++ b/snakePipes/shared/rscripts/DE_functions.R @@ -94,7 +94,13 @@ checktable <- function(countdata = NA, sampleSheet = NA, alleleSpecific = FALSE, DESeq_basic <- function(countdata, coldata, fdr, alleleSpecific = FALSE, from_salmon = FALSE, size_factors=NA, customFormula=NA) { cnames.sub<-unique(colnames(coldata)[2:which(colnames(coldata) %in% "condition")]) - d<-ifelse(is.na(customFormula),as.formula(noquote(paste0("~",paste(cnames.sub,collapse="+")))),as.formula(paste0("~",customFormula))) + + if(is.na(customFormula)){ + d<-as.formula(noquote(paste0("~",paste(cnames.sub,collapse="+")))) + } else { + + d<-as.formula(paste0("~",customFormula)) + } # Normal DESeq @@ -123,7 +129,8 @@ DESeq_basic <- function(countdata, coldata, fdr, alleleSpecific = FALSE, from_sa } dds <- DESeq2::DESeq(dds) ddr <- DESeq2::results(dds, alpha = fdr) - auto_coef<-resultsNames(dds)[length(resultsNames(dds))] + a <- DESeq2::resultsNames(dds) + auto_coef <- a[length(a)] ddr_shrunk <- DESeq2::lfcShrink(dds,coef=auto_coef,type="apeglm",res=ddr) output <- list(dds = dds, ddr = ddr, ddr_shrunk = ddr_shrunk) return(output) diff --git a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile index 3208a208e..68b13562c 100644 --- a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile @@ -35,26 +35,13 @@ rule DESeq2: allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE', tx2gene_file = 'NA', rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), - formula = config["formula"] + formula = config["formula"], + counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table) log: out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")), err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")) conda: CONDA_RNASEQ_ENV script: "{params.script}" - #shell: - # "cd {params.outdir} && " - # "Rscript {params.script} " - # "{params.sampleSheet} " # 1 - # "../{input.counts_table} " # 2 - # "{params.fdr} " # 3 - # "../{input.symbol_file} " # 4 - # "{params.importfunc} " # 5 - # "{params.allele_info} " # 6 - # "{params.tx2gene_file} " # 7 - # "{params.rmdTemplate} " # 8 - # "{params.formula} " # 9 - # " > ../{log.out} 2> ../{log.err}" - ## DESeq2 (on Salmon) rule DESeq2_Salmon_basic: From dcc032706870ff5ebf26497e4cd9845609c829af Mon Sep 17 00:00:00 2001 From: WardDeb Date: Thu, 7 Mar 2024 16:33:04 +0100 Subject: [PATCH 13/66] update help on NMaskedIndex, untemp unphased allelic bams --- snakePipes/parserCommon.py | 3 ++- snakePipes/shared/rules/Bowtie2_allelic.snakefile | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/snakePipes/parserCommon.py b/snakePipes/parserCommon.py index 0042ce5da..cf0280d43 100644 --- a/snakePipes/parserCommon.py +++ b/snakePipes/parserCommon.py @@ -168,7 +168,8 @@ def snpArguments(defaults): snpargs.add_argument("--NMaskedIndex", default='', - help="N-masked index of the reference genome (default: 'None')") + help="N-masked index of the reference genome (default: 'None'). " + "Note that this should point to a file (i.e. 'Genome' for STAR indices, genome.1.bt2 for bowtie2 indices).") return parser diff --git a/snakePipes/shared/rules/Bowtie2_allelic.snakefile b/snakePipes/shared/rules/Bowtie2_allelic.snakefile index 97756d09b..12854c3cd 100755 --- a/snakePipes/shared/rules/Bowtie2_allelic.snakefile +++ b/snakePipes/shared/rules/Bowtie2_allelic.snakefile @@ -17,7 +17,7 @@ if aligner == "Bowtie2": index = bowtie2_index_allelic output: align_summary = aligner+"/{sample}.Bowtie2_summary.txt", - bam = temp(aligner+"/{sample}.sorted.bam") + bam = aligner+"/{sample}.sorted.bam" log: "Bowtie2/logs/{sample}.sort.log" params: alignerOpts = str(alignerOpts or ''), @@ -51,7 +51,7 @@ if aligner == "Bowtie2": index = bowtie2_index_allelic output: align_summary = aligner+"/{sample}.Bowtie2_summary.txt", - bam = temp(aligner+"/{sample}.sorted.bam") + bam = aligner+"/{sample}.sorted.bam" log: "Bowtie2/logs/{sample}.sort.log" params: alignerOpts = str(alignerOpts or ''), From 1b5dfb833ab91d27816a010302f40203cda2b837 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Thu, 7 Mar 2024 16:59:33 +0100 Subject: [PATCH 14/66] change numbers in dag test due to temp omission --- .ci_stuff/test_dag.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh index f4f4bcf74..4716d1c87 100755 --- a/.ci_stuff/test_dag.sh +++ b/.ci_stuff/test_dag.sh @@ -175,11 +175,11 @@ WC=`DNA-mapping -i PE_input -o output .ci_stuff/organism.yaml --snakemakeOptions if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1402 ]; then exit 1 ; fi #allelic WC=`DNA-mapping -m allelic-mapping -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1,strain2 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2466 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2457 ]; then exit 1 ; fi WC=`DNA-mapping -m allelic-mapping -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" --SNPfile allelic_input/snpfile.txt --NMaskedIndex allelic_input/Ngenome .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2445 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2436 ]; then exit 1 ; fi WC=`DNA-mapping -m allelic-mapping -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2466 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2457 ]; then exit 1 ; fi # ChIP-seq WC=`ChIP-seq -d BAM_input --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` From 44e828616883b533c4a2d2f574a732d3018dc7c1 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Fri, 8 Mar 2024 11:03:22 +0100 Subject: [PATCH 15/66] bowtie2 - allelic mode actually returns .filtered. reports, update multiqc report as well --- snakePipes/shared/rules/SNPsplit.snakefile | 4 ++-- snakePipes/shared/rules/multiQC.snakefile | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snakePipes/shared/rules/SNPsplit.snakefile b/snakePipes/shared/rules/SNPsplit.snakefile index 37169ab55..ea06f0f15 100644 --- a/snakePipes/shared/rules/SNPsplit.snakefile +++ b/snakePipes/shared/rules/SNPsplit.snakefile @@ -8,8 +8,8 @@ if aligner == "Bowtie2": output: targetbam = expand("allelic_bams/{{sample}}.filtered.{suffix}.bam", suffix = ['allele_flagged', 'genome1', 'genome2', 'unassigned']), tempbam = temp("filtered_bam/{sample}.filtered.sortedByName.bam"), - rep1 = "allelic_bams/{sample}.SNPsplit_report.yaml", - rep2 = "allelic_bams/{sample}.SNPsplit_sort.yaml" + rep1 = "allelic_bams/{sample}.filtered.SNPsplit_report.yaml", + rep2 = "allelic_bams/{sample}.filtered.SNPsplit_sort.yaml" log: "allelic_bams/logs/{sample}.snp_split.log" params: pairedEnd = '--paired' if pairedEnd else '', diff --git a/snakePipes/shared/rules/multiQC.snakefile b/snakePipes/shared/rules/multiQC.snakefile index 619a05398..308cbc060 100755 --- a/snakePipes/shared/rules/multiQC.snakefile +++ b/snakePipes/shared/rules/multiQC.snakefile @@ -61,8 +61,8 @@ def multiqc_input_check(return_value): infiles.append( expand("Qualimap_qc/{sample}.filtered.bamqc_results.txt", sample = samples) ) indir += " Qualimap_qc " if "allelic-mapping" in mode: - infiles.append( expand("allelic_bams/{sample}.SNPsplit_report.yaml", sample = samples) ) - infiles.append( expand("allelic_bams/{sample}.SNPsplit_sort.yaml", sample = samples) ) + infiles.append( expand("allelic_bams/{sample}.filtered.SNPsplit_report.yaml", sample = samples) ) + infiles.append( expand("allelic_bams/{sample}.filtered.SNPsplit_sort.yaml", sample = samples) ) indir += "allelic_bams" elif pipeline=="rna-seq": # must be RNA-mapping, add files as per the mode From ae536b9602b0d0ee21fe9df47df0fdbfe911f97e Mon Sep 17 00:00:00 2001 From: WardDeb Date: Fri, 8 Mar 2024 12:13:09 +0100 Subject: [PATCH 16/66] revert temping original bam file --- .ci_stuff/test_dag.sh | 6 +++--- snakePipes/shared/rules/Bowtie2_allelic.snakefile | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh index 4716d1c87..f4f4bcf74 100755 --- a/.ci_stuff/test_dag.sh +++ b/.ci_stuff/test_dag.sh @@ -175,11 +175,11 @@ WC=`DNA-mapping -i PE_input -o output .ci_stuff/organism.yaml --snakemakeOptions if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1402 ]; then exit 1 ; fi #allelic WC=`DNA-mapping -m allelic-mapping -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1,strain2 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2457 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2466 ]; then exit 1 ; fi WC=`DNA-mapping -m allelic-mapping -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" --SNPfile allelic_input/snpfile.txt --NMaskedIndex allelic_input/Ngenome .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2436 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2445 ]; then exit 1 ; fi WC=`DNA-mapping -m allelic-mapping -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2457 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2466 ]; then exit 1 ; fi # ChIP-seq WC=`ChIP-seq -d BAM_input --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` diff --git a/snakePipes/shared/rules/Bowtie2_allelic.snakefile b/snakePipes/shared/rules/Bowtie2_allelic.snakefile index 12854c3cd..97756d09b 100755 --- a/snakePipes/shared/rules/Bowtie2_allelic.snakefile +++ b/snakePipes/shared/rules/Bowtie2_allelic.snakefile @@ -17,7 +17,7 @@ if aligner == "Bowtie2": index = bowtie2_index_allelic output: align_summary = aligner+"/{sample}.Bowtie2_summary.txt", - bam = aligner+"/{sample}.sorted.bam" + bam = temp(aligner+"/{sample}.sorted.bam") log: "Bowtie2/logs/{sample}.sort.log" params: alignerOpts = str(alignerOpts or ''), @@ -51,7 +51,7 @@ if aligner == "Bowtie2": index = bowtie2_index_allelic output: align_summary = aligner+"/{sample}.Bowtie2_summary.txt", - bam = aligner+"/{sample}.sorted.bam" + bam = temp(aligner+"/{sample}.sorted.bam") log: "Bowtie2/logs/{sample}.sort.log" params: alignerOpts = str(alignerOpts or ''), From 1d9860557caa83e919a5f6e6afc51ad8b9d8192c Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Mon, 25 Mar 2024 13:18:47 +0100 Subject: [PATCH 17/66] chipseq added seacr peak qc --- .../shared/rules/ChIP_peak_calling.snakefile | 33 +++++++++++++++++ .../rules/ChIP_peak_calling_spikein.snakefile | 36 +++++++++++++++++++ snakePipes/workflows/ChIP-seq/Snakefile | 2 ++ 3 files changed, 71 insertions(+) diff --git a/snakePipes/shared/rules/ChIP_peak_calling.snakefile b/snakePipes/shared/rules/ChIP_peak_calling.snakefile index 687a4c708..5caf12382 100755 --- a/snakePipes/shared/rules/ChIP_peak_calling.snakefile +++ b/snakePipes/shared/rules/ChIP_peak_calling.snakefile @@ -260,3 +260,36 @@ rule SEACR_peaks: shell: """ bash {params.script} {input.chip} {input.control} {params.fdr} "norm" "stringent" {params.prefix} 2>{log} """ + +rule SEACR_peak_qc: + input: + bam = "filtered_bam/{sample}.filtered.bam", + peaks = "SEACR/{sample}.filtered.stringent.bed" + output: + qc = "SEACR/{sample}.filtered.stringent_peaks.qc.txt" + params: + genome_index = genome_index + benchmark: + "SEACR/.benchmark/SEACR_peak_qc.{sample}.filtered.benchmark" + conda: CONDA_SHARED_ENV + shell: """ + # get the number of peaks + peak_count=`wc -l < {input.peaks}` + + # get the number of mapped reads + mapped_reads=`samtools view -c -F 4 {input.bam}` + + #calculate the number of alignments overlapping the peaks + # exclude reads flagged as unmapped (unmapped reads will be reported when using -L) + reads_in_peaks=`samtools view -c -F 4 -L {input.peaks} {input.bam}` + + # calculate Fraction of Reads In Peaks + frip=`bc -l <<< "$reads_in_peaks/$mapped_reads"` + # compute peak genome coverage + peak_len=`awk '{{total+=$3-$2}}END{{print total}}' {input.peaks}` + genome_size=`awk '{{total+=$3-$2}}END{{print total}}' {params.genome_index}` + genomecov=`bc -l <<< "$peak_len/$genome_size"` + + # write peak-based QC metrics to output file + printf "peak_count\tFRiP\tpeak_genome_coverage\n%d\t%5.3f\t%6.4f\n" $peak_count $frip $genomecov > {output.qc} + """ diff --git a/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile b/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile index 0120984ce..f2c5d1908 100755 --- a/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile +++ b/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile @@ -264,3 +264,39 @@ rule SEACR_peaks: shell: """ bash {params.script} {input.chip} {input.control} {params.fdr} "non" "stringent" {params.prefix} 2>{log} """ + + +rule SEACR_peak_qc: + input: + bam = "split_bam/{sample}_host.bam", + peaks = "SEACR/{sample}_host.stringent.bed" + output: + qc = "SEACR/{sample}_host.stringent_peaks.qc.txt" + params: + genome_index = genome_index + benchmark: + "SEACR/.benchmark/SEACR_peak_qc.{sample}_host.benchmark" + conda: CONDA_SHARED_ENV + shell: """ + # get the number of peaks + peak_count=`wc -l < {input.peaks}` + + #get the number of mapped reads + mapped_reads=`samtools view -c -F 4 {input.bam}` + + #calculate the number of alignments overlapping the peaks + # exclude reads flagged as unmapped (unmapped reads will be reported when using -L) + reads_in_peaks=`samtools view -c -F 4 -L {input.peaks} {input.bam}` + + # calculate Fraction of Reads In Peaks + frip=`bc -l <<< "$reads_in_peaks/$mapped_reads"` + # compute peak genome coverage + peak_len=`awk '{{total+=$3-$2}}END{{print total}}' {input.peaks}` + genome_size=`awk '{{total+=$3-$2}}END{{print total}}' {params.genome_index}` + genomecov=`bc -l <<< "$peak_len/$genome_size"` + + # write peak-based QC metrics to output file + printf "peak_count\tFRiP\tpeak_genome_coverage\n%d\t%5.3f\t%6.4f\n" $peak_count $frip $genomecov > {output.qc} + """ + + diff --git a/snakePipes/workflows/ChIP-seq/Snakefile b/snakePipes/workflows/ChIP-seq/Snakefile index 6a5cfed45..1cb6d6185 100755 --- a/snakePipes/workflows/ChIP-seq/Snakefile +++ b/snakePipes/workflows/ChIP-seq/Snakefile @@ -210,8 +210,10 @@ def run_seacr(): if (peakCaller == "SEACR"): if useSpikeInForNorm: file_list = expand("SEACR/{chip_sample}_host.stringent.bed", chip_sample = chip_samples) + file_list.append(expand("SEACR/{chip_sample}.host.stringent_peaks.qc.txt",chip_sample=chip_samples)) else: file_list = expand("SEACR/{chip_sample}.filtered.stringent.bed", chip_sample = chip_samples) + file_list.append(expand("SEACR/{chip_sample}.filtered.stringent_peaks.qc.txt",chip_sample=chip_samples)) return (file_list) return ([]) From 0dc24b650d7ce0b6af86bf198928df4747279183 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Mon, 25 Mar 2024 13:20:16 +0100 Subject: [PATCH 18/66] Ronald's fix for nearestGene --- snakePipes/shared/rscripts/nearestGene.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snakePipes/shared/rscripts/nearestGene.R b/snakePipes/shared/rscripts/nearestGene.R index ee98c0d50..12fdac694 100644 --- a/snakePipes/shared/rscripts/nearestGene.R +++ b/snakePipes/shared/rscripts/nearestGene.R @@ -30,9 +30,9 @@ if(any(is.na(size_v),sum(size_v==0)>0)){message('Some of the input files are non gs_tab<-data.table::fread(gene_symbol,header=FALSE) if(pipeline %in% c("chip-seq","ATAC-seq")){ - ibed_tab$GeneID<-t2g_tab$V2[match(ibed_tab$V19,t2g_tab$V1)] + ibed_tab$GeneID<-t2g_tab$V2[match(ibed_tab$V22,t2g_tab$V1)] ibed_tab$GeneSymbol<-gs_tab$V2[match(ibed_tab$GeneID,gs_tab$V1)] - obed_tab<-unique(subset(ibed_tab,select=c(paste0("V",c(1:18,20,21)),"GeneID","GeneSymbol"))) + obed_tab<-unique(subset(ibed_tab,select=c(paste0("V",c(1:18,23,24)),"GeneID","GeneSymbol"))) colnames(obed_tab)<-c("Chromosome","Start","End","Width","Strand","Score","nWindows","logFC.up","logFC.down","PValue","FDR","direction","rep.test","rep.logFC","best.logFC","best.test","best.start","Name","GeneStrand","Distance","GeneID","GeneSymbol") } From 5b4b22bf60007f2bd478b0924fa64ce7cc08474f Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Mon, 25 Mar 2024 13:35:01 +0100 Subject: [PATCH 19/66] Genrich ignoreForNorm --- .../shared/rules/ChIP_peak_calling.snakefile | 20 +++++++++++-------- .../rules/ChIP_peak_calling_spikein.snakefile | 16 +++++++-------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/snakePipes/shared/rules/ChIP_peak_calling.snakefile b/snakePipes/shared/rules/ChIP_peak_calling.snakefile index 5caf12382..d1c63f42b 100755 --- a/snakePipes/shared/rules/ChIP_peak_calling.snakefile +++ b/snakePipes/shared/rules/ChIP_peak_calling.snakefile @@ -170,10 +170,11 @@ if not isMultipleComparison: bams = lambda wildcards: ",".join(expand(os.path.join("filtered_bam", "{sample}.namesorted.bam"), sample=genrichDict[wildcards.group])), blacklist = "-E {}".format(blacklist_bed) if blacklist_bed else "", control_pfx=lambda wildcards,input: "-c" if input.control else "", - control=lambda wildcards,input: ",".join(input.control) if input.control else "" + control=lambda wildcards,input: ",".join(input.control) if input.control else "", + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -y 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} {params.ignoreForNorm} -y 2> {log} """ else: rule Genrich_peaks: @@ -188,10 +189,11 @@ if not isMultipleComparison: blacklist = "-E {}".format(blacklist_bed) if blacklist_bed else "", control_pfx=lambda wildcards,input: "-c" if input.control else "", control=lambda wildcards,input: ",".join(input.control) if input.control else "", - frag_size=fragmentLength + frag_size=fragmentLength, + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -w {params.frag_size} 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} {params.ignoreForNorm} -w {params.frag_size} 2> {log} """ else: if pairedEnd: @@ -206,10 +208,11 @@ else: bams = lambda wildcards: ",".join(expand(os.path.join("filtered_bam", "{sample}.namesorted.bam"), sample=genrichDict[wildcards.compGroup][wildcards.group])), blacklist = "-E {}".format(blacklist_bed) if blacklist_bed else "", control_pfx=lambda wildcards,input: "-c" if input.control else "", - control=lambda wildcards,input: ",".join(input.control) if input.control else "" + control=lambda wildcards,input: ",".join(input.control) if input.control else "", + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -y 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} {params.ignoreForNorm} -y 2> {log} """ else: rule Genrich_peaks: @@ -224,10 +227,11 @@ else: blacklist = "-E {}".format(blacklist_bed) if blacklist_bed else "", control_pfx=lambda wildcards,input: "-c" if input.control else "", control=lambda wildcards,input: ",".join(input.control) if input.control else "", - frag_size=fragmentLength + frag_size=fragmentLength, + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -w {params.frag_size} 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} {params.ignoreForNorm} -w {params.frag_size} 2> {log} """ diff --git a/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile b/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile index f2c5d1908..af5ac7997 100755 --- a/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile +++ b/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile @@ -175,10 +175,10 @@ if not isMultipleComparison: blacklist = "-E {}".format(blacklist_bed) if blacklist_bed else "", control_pfx=lambda wildcards,input: "-c" if input.control else "", control=lambda wildcards,input: ",".join(input.control) if input.control else "", - spikein_chroms=",".join(spikein_chr) + ignoreForNorm = '-e ' + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -e {params.spikein_chroms} -y 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} {params.ignoreForNorm} -y 2> {log} """ else: rule Genrich_peaks: @@ -194,10 +194,10 @@ if not isMultipleComparison: control_pfx=lambda wildcards,input: "-c" if input.control else "", control=lambda wildcards,input: ",".join(input.control) if input.control else "", frag_size=fragmentLength, - spikein_chroms=",".join(spikein_chr) + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -e {params.spikein_chroms} -w {params.frag_size} 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -e {params.ignoreForNorm} -w {params.frag_size} 2> {log} """ else: if pairedEnd: @@ -213,10 +213,10 @@ else: blacklist = "-E {}".format(blacklist_bed) if blacklist_bed else "", control_pfx=lambda wildcards,input: "-c" if input.control else "", control=lambda wildcards,input: ",".join(input.control) if input.control else "", - spikein_chroms=",".join(spikein_chr) + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -e {params.spikein_chroms} -y 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} {params.ignoreForNorm} -y 2> {log} """ else: rule Genrich_peaks: @@ -232,10 +232,10 @@ else: control_pfx=lambda wildcards,input: "-c" if input.control else "", control=lambda wildcards,input: ",".join(input.control) if input.control else "", frag_size=fragmentLength, - spikein_chroms=",".join(spikein_chr) + ignoreForNorm = "-e " + ','.join(ignoreForNormalization) if ignoreForNormalization else "" conda: CONDA_CHIPSEQ_ENV shell: """ - Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -e {params.spikein_chroms} -w {params.frag_size} 2> {log} + Genrich -t {params.bams} {params.control_pfx} {params.control} -o {output} -r {params.blacklist} -e {params.ignoreForNorm} -w {params.frag_size} 2> {log} """ From 86140d2bdc89d333e2a0c37a4e79fbf77b79b5df Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Mon, 25 Mar 2024 14:56:28 +0100 Subject: [PATCH 20/66] added external bed --- snakePipes/workflows/ChIP-seq/ChIP-seq | 5 +++++ snakePipes/workflows/ChIP-seq/defaults.yaml | 2 ++ 2 files changed, 7 insertions(+) diff --git a/snakePipes/workflows/ChIP-seq/ChIP-seq b/snakePipes/workflows/ChIP-seq/ChIP-seq index 50d242a71..3e0d0d020 100755 --- a/snakePipes/workflows/ChIP-seq/ChIP-seq +++ b/snakePipes/workflows/ChIP-seq/ChIP-seq @@ -21,6 +21,7 @@ def parse_args(defaults={"verbose": False, "configFile": None, "pairedEnd": True, "fragmentLength":200, "bwBinSize": 25, "windowSize": 150, "predictChIPDict": None, "fromBAM": False, "bigWigType": "both", "peakCaller": "MACS2", "sampleSheet": "", + "externalBed": "", "plotFormat": "png", "bamExt": ".filtered.bam", "fdr": 0.05, "absBestLFC": 1, "useSpikeinForNorm": False, "spikeinExt": "_spikein", "peakCallerOptions": "--qvalue 0.001","cutntag": False, @@ -108,6 +109,10 @@ def parse_args(defaults={"verbose": False, "configFile": None, "are not evaluated for differential binding) (default: '%(default)s')", default=defaults["sampleSheet"]) + optional.add_argument("--externalBed", + help="A bed file with intervals to be tested for differential binding. (default: '%(default)s')", + default=defaults["externalBed"]) + optional.add_argument("--windowSize", help="Window size to counts reads in (If differential binding analysis required); " "Default size is suitable for most transcription factors and sharp histone marks. " diff --git a/snakePipes/workflows/ChIP-seq/defaults.yaml b/snakePipes/workflows/ChIP-seq/defaults.yaml index 1d8f1fcd7..3608e0a67 100755 --- a/snakePipes/workflows/ChIP-seq/defaults.yaml +++ b/snakePipes/workflows/ChIP-seq/defaults.yaml @@ -44,6 +44,8 @@ fragmentLength: 200 verbose: false # sampleSheet_DB sampleSheet: +# externalBed for DB +externalBed: # windowSize windowSize: 150 #### Flag to control the pipeline entry point From c1e89591194560f3a0671767619e4704bdb720bb Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Mon, 25 Mar 2024 15:16:39 +0100 Subject: [PATCH 21/66] added externalBed to ChIPseq/CSAW --- snakePipes/shared/rscripts/CSAW.R | 1 + snakePipes/shared/rules/CSAW.multiComp.snakefile | 4 +++- snakePipes/shared/rules/CSAW.singleComp.snakefile | 4 +++- snakePipes/workflows/ChIP-seq/Snakefile | 2 +- snakePipes/workflows/ChIP-seq/internals.snakefile | 6 ++++++ 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/snakePipes/shared/rscripts/CSAW.R b/snakePipes/shared/rscripts/CSAW.R index 44efdb091..9e5f623bd 100644 --- a/snakePipes/shared/rscripts/CSAW.R +++ b/snakePipes/shared/rscripts/CSAW.R @@ -98,6 +98,7 @@ if (!is.null(sampleInfo$UseRegions)) { fnames<-sampleInfo$name } + if(snakemake@params[['peakCaller']] == "MACS2") { allpeaks <- lapply(fnames, function(x) { narrow <- paste0("../MACS2/",x,bam_pfx,".BAM_peaks.narrowPeak") #bam_pfx diff --git a/snakePipes/shared/rules/CSAW.multiComp.snakefile b/snakePipes/shared/rules/CSAW.multiComp.snakefile index a451b1b73..5e49fb490 100644 --- a/snakePipes/shared/rules/CSAW.multiComp.snakefile +++ b/snakePipes/shared/rules/CSAW.multiComp.snakefile @@ -22,8 +22,10 @@ def getInputPeaks(peakCaller, chip_samples, genrichDict,comp_group): return expand("SEACR/{chip_sample}_host.stringent.bed",chip_sample=chip_samples) elif pipeline == "chip-seq" and not useSpikeInForNorm: return expand("SEACR/{chip_sample}.filtered.stringent.bed",chip_sample=chip_samples) - else: + elif peakCaller == "Genrich": return expand("Genrich/{genrichGroup}.{{compGroup}}.narrowPeak", genrichGroup = genrichDict[comp_group].keys()) + elif externalBed: + return externalBed def getSizeMetrics(): diff --git a/snakePipes/shared/rules/CSAW.singleComp.snakefile b/snakePipes/shared/rules/CSAW.singleComp.snakefile index d4d24693e..c154ec539 100644 --- a/snakePipes/shared/rules/CSAW.singleComp.snakefile +++ b/snakePipes/shared/rules/CSAW.singleComp.snakefile @@ -16,8 +16,10 @@ def getInputPeaks(peakCaller, chip_samples, genrichDict): return expand("SEACR/{chip_sample}_host.stringent.bed",chip_sample=chip_samples) elif pipeline == "chip-seq" and not useSpikeInForNorm: return expand("SEACR/{chip_sample}.filtered.stringent.bed",chip_sample=chip_samples) - else: + elif peakCaller == "Genrich": return expand("Genrich/{genrichGroup}.narrowPeak", genrichGroup = genrichDict.keys()) + elif externalBed: + return externalBed def getSizeMetrics(): diff --git a/snakePipes/workflows/ChIP-seq/Snakefile b/snakePipes/workflows/ChIP-seq/Snakefile index 1cb6d6185..143807366 100755 --- a/snakePipes/workflows/ChIP-seq/Snakefile +++ b/snakePipes/workflows/ChIP-seq/Snakefile @@ -195,7 +195,7 @@ def run_genrich(): def run_macs2(): - if (peakCaller == "MACS2") or cutntag: + if peakCaller == "MACS2": if useSpikeInForNorm: file_list = expand("MACS2/{chip_sample}_host.BAM_peaks.xls", chip_sample = chip_samples) file_list.extend(expand("MACS2/{chip_sample}_host.BAM_peaks.qc.txt", chip_sample = chip_samples)) diff --git a/snakePipes/workflows/ChIP-seq/internals.snakefile b/snakePipes/workflows/ChIP-seq/internals.snakefile index ac95b5726..2ffdba470 100755 --- a/snakePipes/workflows/ChIP-seq/internals.snakefile +++ b/snakePipes/workflows/ChIP-seq/internals.snakefile @@ -276,3 +276,9 @@ if useSpikeInForNorm: else: print("\n No spikein genome detected - no spikeIn chromosomes found with extention " + spikeinExt + " .\n\n") exit(1) + +if externalBed: + if os.path.isfile(externalBed): + peakCaller = os.path.splitext(os.path.basename(sampleSheet))[0] + else: + warnings.warn("{} file not found.".format(externalBed)) From caa6c621613e31bf27e6ac7316de65aa84236dc8 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 26 Mar 2024 10:45:23 +0100 Subject: [PATCH 22/66] copyfile --- snakePipes/common_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/common_functions.py b/snakePipes/common_functions.py index 8a5457cd1..61d4d56cf 100644 --- a/snakePipes/common_functions.py +++ b/snakePipes/common_functions.py @@ -848,7 +848,7 @@ def copySampleSheet(sampleSheet, wdir): if os.path.isfile(sampleSheet) and os.path.exists(wdir): bname = os.path.basename(sampleSheet) try: - shutil.copy(sampleSheet, os.path.join(wdir, bname)) + shutil.copyfile(sampleSheet, os.path.join(wdir, bname)) except Exception as err: print("Unexpected error:\n{}".format(err)) raise From 3d6327eadd4a3ce9bcc5f00482d3b54bfb51ed47 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 26 Mar 2024 10:49:19 +0100 Subject: [PATCH 23/66] fix --- snakePipes/workflows/ChIP-seq/internals.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/workflows/ChIP-seq/internals.snakefile b/snakePipes/workflows/ChIP-seq/internals.snakefile index 2ffdba470..56877b89a 100755 --- a/snakePipes/workflows/ChIP-seq/internals.snakefile +++ b/snakePipes/workflows/ChIP-seq/internals.snakefile @@ -279,6 +279,6 @@ if useSpikeInForNorm: if externalBed: if os.path.isfile(externalBed): - peakCaller = os.path.splitext(os.path.basename(sampleSheet))[0] + peakCaller = os.path.splitext(os.path.basename(externalBed))[0] else: warnings.warn("{} file not found.".format(externalBed)) From 903ea4bc4920ccec683b8b059111b3ba4370a352 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 26 Mar 2024 11:19:49 +0100 Subject: [PATCH 24/66] fixes external bed csaw --- snakePipes/shared/rscripts/CSAW.R | 59 +++++++++++-------- .../shared/rules/CSAW.multiComp.snakefile | 3 +- .../shared/rules/CSAW.singleComp.snakefile | 3 +- 3 files changed, 37 insertions(+), 28 deletions(-) diff --git a/snakePipes/shared/rscripts/CSAW.R b/snakePipes/shared/rscripts/CSAW.R index 9e5f623bd..0da07cca1 100644 --- a/snakePipes/shared/rscripts/CSAW.R +++ b/snakePipes/shared/rscripts/CSAW.R @@ -15,6 +15,9 @@ outdir<-snakemake@params[["outdir"]] yaml_path<-snakemake@params[["yaml_path"]] useSpikeInForNorm<-snakemake@params[["useSpikeInForNorm"]] scale_factors<-snakemake@params[["scale_factors"]] +external_bed<-as.logical(snakemake@params[["externalBed"]]) + + bam_pfx<-ifelse(useSpikeInForNorm,"_host",".filtered") bam_folder<-ifelse(useSpikeInForNorm,"split_bam","filtered_bam") @@ -42,6 +45,7 @@ message(paste("FDR:", fdr, "\n")) message(paste("LFC:", lfc, "\n")) message(paste("paired-end? :", pairedEnd, "\n")) message(paste("allele-specific? :", allelic_info, "\n")) +message(paste("External bam? ;", external_bam, "\n")) ## sampleInfo (setup of the experiment) sampleInfo <- read.table(sampleInfoFilePath, header = TRUE, colClasses = c("character", "character")) @@ -98,32 +102,36 @@ if (!is.null(sampleInfo$UseRegions)) { fnames<-sampleInfo$name } - -if(snakemake@params[['peakCaller']] == "MACS2") { - allpeaks <- lapply(fnames, function(x) { - narrow <- paste0("../MACS2/",x,bam_pfx,".BAM_peaks.narrowPeak") #bam_pfx - if(snakemake@params[["pipeline"]] %in% "ATAC-seq"){ - narrow <- paste0("../MACS2/",x,".filtered.short.BAM_peaks.narrowPeak") - } - broad <- paste0("../MACS2/",x,bam_pfx,".BAM_peaks.broadPeak") #bam_pfx - # first look for narrowpeak then braod peak - if(file.exists(narrow)) { - bed <- read.delim(narrow, header = FALSE) - } else if (file.exists(broad)) { - bed <- read.delim(broad, header = FALSE) - } else { - stop("MACS2 output doesn't exist. Neither ", narrow, " , nor ", broad) - } - - bed.gr <- GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4) - return(bed.gr) - }) -} else { - allpeaks = lapply(snakemake@input[['peaks']], function(x) { - bed = read.delim(paste0("../", x), header=FALSE) +if !( external_bed) { + if(snakemake@params[['peakCaller']] == "MACS2") { + allpeaks <- lapply(fnames, function(x) { + narrow <- paste0("../MACS2/",x,bam_pfx,".BAM_peaks.narrowPeak") #bam_pfx + if(snakemake@params[["pipeline"]] %in% "ATAC-seq"){ + narrow <- paste0("../MACS2/",x,".filtered.short.BAM_peaks.narrowPeak") + } + broad <- paste0("../MACS2/",x,bam_pfx,".BAM_peaks.broadPeak") #bam_pfx + #first look for narrowpeak then braod peak + if(file.exists(narrow)) { + bed <- read.delim(narrow, header = FALSE) + } else if (file.exists(broad)) { + bed <- read.delim(broad, header = FALSE) + } else { + stop("MACS2 output doesn't exist. Neither ", narrow, " , nor ", broad) + } + + bed.gr <- GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4) + return(bed.gr) + }) + } else { + allpeaks = lapply(snakemake@input[['peaks']], function(x) { + bed = read.delim(paste0("../", x), header=FALSE) + bed.gr = GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4) + return(bed.gr) + }) + } + } else { + bed = read.delim(snakemake@input[['pekas']],header=FALSE) bed.gr = GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4) - return(bed.gr) - }) } # merge @@ -166,4 +174,3 @@ sink("CSAW.session_info.txt") sessionInfo() sink() -print("DONE..!") diff --git a/snakePipes/shared/rules/CSAW.multiComp.snakefile b/snakePipes/shared/rules/CSAW.multiComp.snakefile index 5e49fb490..fe9f6d124 100644 --- a/snakePipes/shared/rules/CSAW.multiComp.snakefile +++ b/snakePipes/shared/rules/CSAW.multiComp.snakefile @@ -105,7 +105,8 @@ rule CSAW: insert_size_metrics = lambda wildcards,input: os.path.join(outdir, input.insert_size_metrics) if pairedEnd else [], pipeline = pipeline, useSpikeInForNorm = useSpikeInForNorm, - scale_factors = lambda wildcards, input: os.path.join(outdir, input.scale_factors) if input.scale_factors else "" + scale_factors = lambda wildcards, input: os.path.join(outdir, input.scale_factors) if input.scale_factors else "", + externalBed = True if externalBed else False log: out = os.path.join(outdir, "{}/logs/CSAW.out".format(get_outdir(peakCaller,os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))), err = os.path.join(outdir, "{}/logs/CSAW.err".format(get_outdir(peakCaller,os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))) diff --git a/snakePipes/shared/rules/CSAW.singleComp.snakefile b/snakePipes/shared/rules/CSAW.singleComp.snakefile index c154ec539..013732eee 100644 --- a/snakePipes/shared/rules/CSAW.singleComp.snakefile +++ b/snakePipes/shared/rules/CSAW.singleComp.snakefile @@ -89,7 +89,8 @@ rule CSAW: insert_size_metrics = lambda wildcards,input: os.path.join(outdir, input.insert_size_metrics) if pairedEnd else [], pipeline = pipeline, useSpikeInForNorm = useSpikeInForNorm, - scale_factors = lambda wildcards, input: os.path.join(outdir, input.scale_factors) if input.scale_factors else "" + scale_factors = lambda wildcards, input: os.path.join(outdir, input.scale_factors) if input.scale_factors else "", + externalBed = True if externalBed else False log: out = os.path.join(outdir, "CSAW_{}_{}/logs/CSAW.out".format(peakCaller, sample_name)), err = os.path.join(outdir, "CSAW_{}_{}/logs/CSAW.err".format(peakCaller, sample_name)) From dea59511e36922edb1279b41d6cfb7ddddb7238f Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 26 Mar 2024 11:26:59 +0100 Subject: [PATCH 25/66] fix --- snakePipes/shared/rscripts/CSAW.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/shared/rscripts/CSAW.R b/snakePipes/shared/rscripts/CSAW.R index 0da07cca1..1017f5464 100644 --- a/snakePipes/shared/rscripts/CSAW.R +++ b/snakePipes/shared/rscripts/CSAW.R @@ -45,7 +45,7 @@ message(paste("FDR:", fdr, "\n")) message(paste("LFC:", lfc, "\n")) message(paste("paired-end? :", pairedEnd, "\n")) message(paste("allele-specific? :", allelic_info, "\n")) -message(paste("External bam? ;", external_bam, "\n")) +message(paste("External bed? ;", external_bed, "\n")) ## sampleInfo (setup of the experiment) sampleInfo <- read.table(sampleInfoFilePath, header = TRUE, colClasses = c("character", "character")) From 0af4a7b53eec8180520357d6e7c4c697a25e72bb Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 26 Mar 2024 11:51:45 +0100 Subject: [PATCH 26/66] fix --- snakePipes/shared/rscripts/CSAW.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/shared/rscripts/CSAW.R b/snakePipes/shared/rscripts/CSAW.R index 1017f5464..12d85ba1e 100644 --- a/snakePipes/shared/rscripts/CSAW.R +++ b/snakePipes/shared/rscripts/CSAW.R @@ -102,7 +102,7 @@ if (!is.null(sampleInfo$UseRegions)) { fnames<-sampleInfo$name } -if !( external_bed) { +if (! external_bed) { if(snakemake@params[['peakCaller']] == "MACS2") { allpeaks <- lapply(fnames, function(x) { narrow <- paste0("../MACS2/",x,bam_pfx,".BAM_peaks.narrowPeak") #bam_pfx From 636f2446efe31f5cfb0fc8e49e90d4c9bbfea1dd Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 26 Mar 2024 13:43:33 +0100 Subject: [PATCH 27/66] typo --- snakePipes/shared/rscripts/CSAW.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/shared/rscripts/CSAW.R b/snakePipes/shared/rscripts/CSAW.R index 12d85ba1e..7ee288120 100644 --- a/snakePipes/shared/rscripts/CSAW.R +++ b/snakePipes/shared/rscripts/CSAW.R @@ -130,7 +130,7 @@ if (! external_bed) { }) } } else { - bed = read.delim(snakemake@input[['pekas']],header=FALSE) + bed = read.delim(snakemake@input[['peaks']],header=FALSE) bed.gr = GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4) } From 6b5c067cd68f4f4e8068b275b0e8916440334b98 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 26 Mar 2024 14:59:41 +0100 Subject: [PATCH 28/66] fix --- snakePipes/shared/rscripts/CSAW.R | 1 + 1 file changed, 1 insertion(+) diff --git a/snakePipes/shared/rscripts/CSAW.R b/snakePipes/shared/rscripts/CSAW.R index 7ee288120..41c5b8878 100644 --- a/snakePipes/shared/rscripts/CSAW.R +++ b/snakePipes/shared/rscripts/CSAW.R @@ -132,6 +132,7 @@ if (! external_bed) { } else { bed = read.delim(snakemake@input[['peaks']],header=FALSE) bed.gr = GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4) + allpeaks <- bed.gr } # merge From 49765ad9ceb03e5cb113979d5f00b2c80ca19700 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 26 Mar 2024 15:49:14 +0100 Subject: [PATCH 29/66] fix gr --- snakePipes/shared/rscripts/CSAW.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/snakePipes/shared/rscripts/CSAW.R b/snakePipes/shared/rscripts/CSAW.R index 41c5b8878..150941f78 100644 --- a/snakePipes/shared/rscripts/CSAW.R +++ b/snakePipes/shared/rscripts/CSAW.R @@ -128,6 +128,8 @@ if (! external_bed) { bed.gr = GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4) return(bed.gr) }) + # merge + allpeaks <- Reduce(function(x,y) GenomicRanges::union(x,y), allpeaks) } } else { bed = read.delim(snakemake@input[['peaks']],header=FALSE) @@ -136,7 +138,7 @@ if (! external_bed) { } # merge -allpeaks <- Reduce(function(x,y) GenomicRanges::union(x,y), allpeaks) +#allpeaks <- Reduce(function(x,y) GenomicRanges::union(x,y), allpeaks) ## keep only these peaks for testing DB message(paste0("Filtering windows using MACS2 output : ", length(allpeaks) , " regions used (Union of peaks)")) From 644825ff58072030a3dd77c5cb8e5408a9c6ec46 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 27 Mar 2024 12:42:01 +0100 Subject: [PATCH 30/66] test dag --- .ci_stuff/test_dag.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh index f4f4bcf74..2a7d2de35 100755 --- a/.ci_stuff/test_dag.sh +++ b/.ci_stuff/test_dag.sh @@ -197,7 +197,7 @@ if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 628 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --bigWigType log2ratio .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 562 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller SEACR .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 759 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 815 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller SEACR --useSpikeInForNorm .ci_stuff/spikein_organism.yaml .ci_stuff/ChIP.sample_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1203 ]; then exit 1 ; fi #noInput From 9211fdbb0929305c8240275b446a4db3a2debdf3 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 27 Mar 2024 12:55:55 +0100 Subject: [PATCH 31/66] typo --- snakePipes/workflows/ChIP-seq/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/workflows/ChIP-seq/Snakefile b/snakePipes/workflows/ChIP-seq/Snakefile index 143807366..213e65189 100755 --- a/snakePipes/workflows/ChIP-seq/Snakefile +++ b/snakePipes/workflows/ChIP-seq/Snakefile @@ -210,7 +210,7 @@ def run_seacr(): if (peakCaller == "SEACR"): if useSpikeInForNorm: file_list = expand("SEACR/{chip_sample}_host.stringent.bed", chip_sample = chip_samples) - file_list.append(expand("SEACR/{chip_sample}.host.stringent_peaks.qc.txt",chip_sample=chip_samples)) + file_list.append(expand("SEACR/{chip_sample}_host.stringent_peaks.qc.txt",chip_sample=chip_samples)) else: file_list = expand("SEACR/{chip_sample}.filtered.stringent.bed", chip_sample = chip_samples) file_list.append(expand("SEACR/{chip_sample}.filtered.stringent_peaks.qc.txt",chip_sample=chip_samples)) From 990899b06600858c0403ebf89a31425370647e2b Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 27 Mar 2024 13:03:46 +0100 Subject: [PATCH 32/66] test dag --- .ci_stuff/test_dag.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh index 2a7d2de35..bb4b71861 100755 --- a/.ci_stuff/test_dag.sh +++ b/.ci_stuff/test_dag.sh @@ -199,7 +199,7 @@ if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 562 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller SEACR .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 815 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller SEACR --useSpikeInForNorm .ci_stuff/spikein_organism.yaml .ci_stuff/ChIP.sample_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1203 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1259 ]; then exit 1 ; fi #noInput WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_noControl_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 403 ]; then exit 1 ; fi From 27a874feefe6e240f2b7a5f70bb5b1748d1656f9 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 27 Mar 2024 13:25:54 +0100 Subject: [PATCH 33/66] test dag --- .ci_stuff/test_dag.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh index bb4b71861..c592724cc 100755 --- a/.ci_stuff/test_dag.sh +++ b/.ci_stuff/test_dag.sh @@ -206,7 +206,7 @@ if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 403 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller Genrich .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_noControl_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 349 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller SEACR .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_noControl_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 469 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 525 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller SEACR --useSpikeInForNorm .ci_stuff/spikein_organism.yaml .ci_stuff/ChIP.sample_noControl_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 696 ]; then exit 1 ; fi # fromBAM From 9e8bc7b74004e5959519ec8315514d88e00afcf1 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 27 Mar 2024 13:31:13 +0100 Subject: [PATCH 34/66] test dag --- .ci_stuff/test_dag.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh index c592724cc..afdf07b32 100755 --- a/.ci_stuff/test_dag.sh +++ b/.ci_stuff/test_dag.sh @@ -208,7 +208,7 @@ if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 349 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller SEACR .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_noControl_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 525 ]; then exit 1 ; fi WC=`ChIP-seq -d BAM_input --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --peakCaller SEACR --useSpikeInForNorm .ci_stuff/spikein_organism.yaml .ci_stuff/ChIP.sample_noControl_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 696 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 752 ]; then exit 1 ; fi # fromBAM WC=`ChIP-seq -d outdir --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --fromBAM BAM_input/filtered_bam/ .ci_stuff/organism.yaml .ci_stuff/ChIP.sample_config.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1187 ]; then exit 1 ; fi From 81752d717bb91888636e182623fa80f2c9e28273 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 27 Mar 2024 13:41:10 +0100 Subject: [PATCH 35/66] external bed atacseq --- snakePipes/workflows/ATAC-seq/ATAC-seq | 5 +++++ snakePipes/workflows/ATAC-seq/defaults.yaml | 1 + snakePipes/workflows/ATAC-seq/internals.snakefile | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/snakePipes/workflows/ATAC-seq/ATAC-seq b/snakePipes/workflows/ATAC-seq/ATAC-seq index 25e0d37f2..386f8b457 100755 --- a/snakePipes/workflows/ATAC-seq/ATAC-seq +++ b/snakePipes/workflows/ATAC-seq/ATAC-seq @@ -20,6 +20,7 @@ def parse_args(defaults={"verbose": False, "configFile": None, "reads": ["_R1", "_R2"], "maxFragmentSize": 150, "minFragmentSize": 0, "fromBAM": False ,"qval": 0.001 ,"sampleSheet": None, + "externalBed": None, "bamExt": "filtered.bam", "fdr": 0.05, "absBestLFC": 1, "UMIDedup": False, "UMIDedupOpts": "", "UMIDedupSep": "_", @@ -71,6 +72,10 @@ def parse_args(defaults={"verbose": False, "configFile": None, help="Invoke differential accessibility analysis by providing information on samples; see 'https://github.com/maxplanck-ie/snakepipes/tree/master/docs/content/sampleSheet.example.tsv' for example. IMPORTANT: The first entry defines which group of samples are control. With this, the order of comparison and likewise the sign of values can be changed! Also, the condition `control` should not be used (reserved to mark input samples in the ChIP-Seq workflow (default: '%(default)s').", default=defaults["sampleSheet"]) + optional.add_argument("--externalBed", + help="A bed file with intervals to be tested for differential binding. (default: '%(default)s')", + default=defaults["externalBed"]) + optional.add_argument("--fromBAM", dest="fromBAM", help="Input folder with bam files. If provided, the analysis will start from this point. (default: '%(default)s')", diff --git a/snakePipes/workflows/ATAC-seq/defaults.yaml b/snakePipes/workflows/ATAC-seq/defaults.yaml index 6bc3d2f65..64e332020 100644 --- a/snakePipes/workflows/ATAC-seq/defaults.yaml +++ b/snakePipes/workflows/ATAC-seq/defaults.yaml @@ -29,6 +29,7 @@ minFragmentSize: 0 verbose: false # sampleSheet_DB sampleSheet: +externalBed: # windowSize windowSize: 20 fragmentCountThreshold: 1 diff --git a/snakePipes/workflows/ATAC-seq/internals.snakefile b/snakePipes/workflows/ATAC-seq/internals.snakefile index 0908411b0..b26325039 100644 --- a/snakePipes/workflows/ATAC-seq/internals.snakefile +++ b/snakePipes/workflows/ATAC-seq/internals.snakefile @@ -84,3 +84,9 @@ if sampleSheet: reordered_dict[g] = {k: filtered_dict[k] for k in [item for sublist in genrichDict[g].values() for item in sublist]} else: genrichDict = {"all_samples": samples} + +if externalBed: + if os.path.isfile(externalBed): + peakCaller = os.path.splitext(os.path.basename(externalBed))[0] + else: + warnings.warn("{} file not found.".format(externalBed)) From 483235346efc429c750c4a10e56499904c5608c4 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 27 Mar 2024 14:12:07 +0100 Subject: [PATCH 36/66] pytest --- tests/test_jobcounts.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_jobcounts.py b/tests/test_jobcounts.py index 67f429cb3..242eb3951 100644 --- a/tests/test_jobcounts.py +++ b/tests/test_jobcounts.py @@ -585,7 +585,7 @@ def test_seacr(self, ifs): print(' '.join([str(i) for i in ci])) _p = sp.run(ci, capture_output=True, text=True) assert _p.returncode == 0 - assert parseSpOut(_p) == 71 + assert parseSpOut(_p) == 77 def test_seacr_spikein(self, ifs): ci = [ "ChIP-seq", @@ -604,7 +604,7 @@ def test_seacr_spikein(self, ifs): print(' '.join([str(i) for i in ci])) _p = sp.run(ci, capture_output=True, text=True) assert _p.returncode == 0 - assert parseSpOut(_p) == 112 + assert parseSpOut(_p) == 118 def test_SE(self, ifs): ci = [ "ChIP-seq", @@ -691,7 +691,7 @@ def test_seacr_noInput(self, ifs): print(' '.join([str(i) for i in ci])) _p = sp.run(ci, capture_output=True, text=True) assert _p.returncode == 0 - assert parseSpOut(_p) == 44 + assert parseSpOut(_p) == 50 def test_seacr_spikein_noInput(self, ifs): ci = [ "ChIP-seq", @@ -710,7 +710,7 @@ def test_seacr_spikein_noInput(self, ifs): print(' '.join([str(i) for i in ci])) _p = sp.run(ci, capture_output=True, text=True) assert _p.returncode == 0 - assert parseSpOut(_p) == 65 + assert parseSpOut(_p) == 71 def test_frombam(self, ifs): ci = [ "ChIP-seq", From bc7722654623ee998f46f110dcfffcc4a61b9bec Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Thu, 4 Apr 2024 09:48:29 +0200 Subject: [PATCH 37/66] mamba minus force --- .github/workflows/linux.yml | 2 +- .github/workflows/osx.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 92d0696fb..c641cf909 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -127,4 +127,4 @@ jobs: run: | micromamba activate snakePipes_CI snakePipes config --tempDir /tmp - snakePipes createEnvs --autodetectCondaEnvDir --force --only ${{matrix.envs}} + snakePipes createEnvs --autodetectCondaEnvDir --only ${{matrix.envs}} diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 2decc8c06..de0b1c84e 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -52,5 +52,5 @@ jobs: - name: createEnvsOSX run: | micromamba activate snakePipes_CI - snakePipes createEnvs --force --autodetectCondaEnvDir --only ${{matrix.envs}} + snakePipes createEnvs --autodetectCondaEnvDir --only ${{matrix.envs}} From 2c64ad6e9a29639108b83957987fedc50c2646fd Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Thu, 4 Apr 2024 10:00:14 +0200 Subject: [PATCH 38/66] rm force from the wrapper --- bin/snakePipes | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/bin/snakePipes b/bin/snakePipes index 754aa0a37..f74a022dd 100755 --- a/bin/snakePipes +++ b/bin/snakePipes @@ -54,11 +54,11 @@ def parse_arguments(): "possible environments are: {}".format(cof.set_env_yamls().keys()), ) - createEnvsParser.add_argument( - "--force", - action="store_true", - help="Force creation of conda environments, even if they apparently exist.", - ) +# createEnvsParser.add_argument( +# "--force", +# action="store_true", +# help="Force creation of conda environments, even if they apparently exist.", +# ) createEnvsParser.add_argument( "--info", @@ -334,7 +334,6 @@ def createCondaEnvs(args): "mamba", "env", "create", - "--force", "--file", os.path.join(baseDir, "shared/rules", env), ] @@ -342,7 +341,7 @@ def createCondaEnvs(args): # Don't actually create the env if either --info is set or it already exists and --force is NOT set if not args.info: - if not os.path.exists(os.path.join(condaDirUse, h)) or args.force: + if not os.path.exists(os.path.join(condaDirUse, h)): try: os.makedirs(os.path.join(condaDirUse, h), exist_ok=True) subprocess.check_call(cmd) From 46af233bdc248514721516368d04722b2fabde51 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Thu, 4 Apr 2024 10:39:46 +0200 Subject: [PATCH 39/66] news.rst --- docs/content/News.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/content/News.rst b/docs/content/News.rst index bdda58274..be3c759eb 100644 --- a/docs/content/News.rst +++ b/docs/content/News.rst @@ -1,5 +1,19 @@ snakePipes News =============== + +snakePipes x.y.z +________________ + +* added SEACR peaks qc +* added external bed functionality for differential binding in ChIP-seq and ATAC-seq workflows +* fixed copyfile command for sampleSheet +* removed deprecated --force argument from mamba commands +* fixes #998 +* fixes #997 +* fixes #996 + + + snakePipes 2.8.1 ---------------- * Boosted versions on shared_env, as python 3.7 and multiqc no longer work together. From 71e243f7b0cf22e64f3dc9bacae921879770de94 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Fri, 5 Apr 2024 12:59:21 +0200 Subject: [PATCH 40/66] fix seacr control --- snakePipes/shared/rules/ChIP_peak_calling.snakefile | 2 +- snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/snakePipes/shared/rules/ChIP_peak_calling.snakefile b/snakePipes/shared/rules/ChIP_peak_calling.snakefile index d1c63f42b..fca12caba 100755 --- a/snakePipes/shared/rules/ChIP_peak_calling.snakefile +++ b/snakePipes/shared/rules/ChIP_peak_calling.snakefile @@ -257,7 +257,7 @@ rule SEACR_peaks: "SEACR/{chip_sample}.filtered.stringent.bed" log: "SEACR/logs/{chip_sample}.log" params: - fdr = fdr, + fdr = lambda wildcards,input: fdr if not input.control else "", prefix = os.path.join(outdir,"SEACR/{chip_sample}.filtered"), script=os.path.join(maindir, "shared","tools/SEACR-1.3/SEACR_1.3.sh") conda: CONDA_SEACR_ENV diff --git a/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile b/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile index af5ac7997..51b083d43 100755 --- a/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile +++ b/snakePipes/shared/rules/ChIP_peak_calling_spikein.snakefile @@ -257,7 +257,7 @@ rule SEACR_peaks: "SEACR/{chip_sample}_host.stringent.bed" log: "SEACR/logs/{chip_sample}.log" params: - fdr = fdr, + fdr = lambda wildcards,input: fdr if not input.control else "", prefix = os.path.join(outdir,"SEACR/{chip_sample}_host"), script=os.path.join(maindir, "shared","tools/SEACR-1.3/SEACR_1.3.sh") conda: CONDA_SEACR_ENV From b4dee94a6f095ff84699c906b28f6aee2e6507af Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Fri, 5 Apr 2024 13:40:57 +0200 Subject: [PATCH 41/66] fixes seacr with control --- snakePipes/shared/rules/envs/chip_seacr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/snakePipes/shared/rules/envs/chip_seacr.yaml b/snakePipes/shared/rules/envs/chip_seacr.yaml index aaa69050f..35bcc1e8d 100644 --- a/snakePipes/shared/rules/envs/chip_seacr.yaml +++ b/snakePipes/shared/rules/envs/chip_seacr.yaml @@ -6,3 +6,4 @@ channels: dependencies: - seacr - ucsc-bigwigtobedgraph + - bedtools From 6a2073e3c0443aa113fae28ca6c6fc686aeaa97f Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Mon, 8 Apr 2024 15:10:50 +0200 Subject: [PATCH 42/66] added allelic-counting mode to mRNAseq --- snakePipes/shared/rules/LinkBam.snakefile | 103 ++++++++++++++-------- snakePipes/workflows/mRNA-seq/Snakefile | 9 +- snakePipes/workflows/mRNA-seq/mRNA-seq | 10 ++- 3 files changed, 77 insertions(+), 45 deletions(-) diff --git a/snakePipes/shared/rules/LinkBam.snakefile b/snakePipes/shared/rules/LinkBam.snakefile index 0514049b6..d95610614 100644 --- a/snakePipes/shared/rules/LinkBam.snakefile +++ b/snakePipes/shared/rules/LinkBam.snakefile @@ -1,49 +1,74 @@ import os -rule link_bam: - input: - indir + "/{sample}" + bamExt - output: - aligner + "/{sample}.unsorted.bam" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam" - params: - input_bai = indir + "/{sample}" + bamExt + ".bai", - output_bai = aligner + "/{sample}.unsorted.bam.bai" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam.bai" - run: - if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)): - os.symlink(params.input_bai,os.path.join(outdir,params.output_bai)) - if not os.path.exists(os.path.join(outdir,output[0])): - os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0])) - -if not pipeline=="noncoding-rna-seq": +if pipeline=="mRNAseq" and mode in "allelic-counting": + rule link_bam: + input: + indir + "/{sample}.{suffix}" + bamExt + output: + "allelic_bams/{sample}.{suffix} + bamExt + params: + input_bai = indir + "/{sample}.{suffix}" + bamExt + ".bai", + output_bai = "allelic_bams/{sample}.{suffix}" + bamExt + ".bai" + run: + if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)): + os.symlink(params.input_bai,os.path.join(outdir,params.output_bai)) + if not os.path.exists(os.path.join(outdir,output[0])): + os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0])) + rule samtools_index_external: input: - aligner + "/{sample}.bam" + "allelic_bams/{sample}.{suffix} + bamExt output: - aligner + "/{sample}.bam.bai" + "allelic_bams/{sample}.{suffix} + bamExt + ".bai" conda: CONDA_SHARED_ENV shell: "if [[ ! -f {output[0]} ]]; then samtools index {input[0]}; fi" - if not pipeline=="WGBS" or pipeline=="WGBS" and skipBamQC: - rule link_bam_bai_external: + +else: + rule link_bam: + input: + indir + "/{sample}" + bamExt + output: + aligner + "/{sample}.unsorted.bam" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam" + params: + input_bai = indir + "/{sample}" + bamExt + ".bai", + output_bai = aligner + "/{sample}.unsorted.bam.bai" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam.bai" + run: + if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)): + os.symlink(params.input_bai,os.path.join(outdir,params.output_bai)) + if not os.path.exists(os.path.join(outdir,output[0])): + os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0])) + + if not pipeline=="noncoding-rna-seq": + rule samtools_index_external: input: - bam = aligner + "/{sample}.bam", - bai = aligner + "/{sample}.bam.bai" + aligner + "/{sample}.bam" output: - bam_out = "filtered_bam/{sample}.filtered.bam", - bai_out = "filtered_bam/{sample}.filtered.bam.bai", - shell: """ - ln -s ../{input.bam} {output.bam_out}; - ln -s ../{input.bai} {output.bai_out} - """ - - - rule sambamba_flagstat: - input: - aligner + "/{sample}.bam" - output: - "Sambamba/{sample}.markdup.txt" - log: "Sambamba/logs/{sample}.flagstat.log" - conda: CONDA_SAMBAMBA_ENV - shell: """ - sambamba flagstat -p {input} > {output} 2> {log} - """ + aligner + "/{sample}.bam.bai" + conda: CONDA_SHARED_ENV + shell: "if [[ ! -f {output[0]} ]]; then samtools index {input[0]}; fi" + + if not pipeline=="WGBS" or pipeline=="WGBS" and skipBamQC: + rule link_bam_bai_external: + input: + bam = aligner + "/{sample}.bam", + bai = aligner + "/{sample}.bam.bai" + output: + bam_out = "filtered_bam/{sample}.filtered.bam", + bai_out = "filtered_bam/{sample}.filtered.bam.bai", + shell: """ + ln -s ../{input.bam} {output.bam_out}; + ln -s ../{input.bai} {output.bai_out} + """ + + + rule sambamba_flagstat: + input: + aligner + "/{sample}.bam" + output: + "Sambamba/{sample}.markdup.txt" + log: "Sambamba/logs/{sample}.flagstat.log" + conda: CONDA_SAMBAMBA_ENV + shell: """ + sambamba flagstat -p {input} > {output} 2> {log} + """ diff --git a/snakePipes/workflows/mRNA-seq/Snakefile b/snakePipes/workflows/mRNA-seq/Snakefile index 9d2c968f9..5252331d4 100755 --- a/snakePipes/workflows/mRNA-seq/Snakefile +++ b/snakePipes/workflows/mRNA-seq/Snakefile @@ -100,11 +100,14 @@ else: include: os.path.join(maindir, "shared", "rules", "SNPsplit.snakefile") # deepTools QC include: os.path.join(maindir, "shared", "rules", "deepTools_RNA_allelic.snakefile") + elif "allelic-counting" in mode: + allele_mode = "from_split_bam" + include: os.path.join(maindir, "shared", "rules", "deepTools_RNA_allelic.snakefile") include: os.path.join(maindir, "shared", "rules", "LinkBam.snakefile") -if "allelic-mapping" in mode: +if "allelic-mapping" in mode or "allelic-counting" in mode: ## featureCounts_allelic include: os.path.join(maindir, "shared", "rules", "featureCounts_allelic.snakefile") else: @@ -123,7 +126,7 @@ include:os.path.join(maindir, "shared", "rules", "RNA-seq_qc_report.snakefile") ## DESeq2 if sampleSheet and not "three-prime-seq" in mode: - if isMultipleComparison and not "allelic-mapping" in mode: + if isMultipleComparison and not "allelic-mapping" in mode and not "allelic-counting" in mode": include: os.path.join(maindir, "shared", "rules", "DESeq2.multipleComp.snakefile") if rMats: include: os.path.join(maindir, "shared", "rules", "rMats.multipleComp.snakefile") @@ -239,7 +242,7 @@ def make_nmasked_genome(): return([]) def run_allelesp_mapping(): - if "allelic-mapping" in mode: + if "allelic-mapping" in mode or "allelic-counting" in mode: allele_suffix = ['allele_flagged', 'genome1', 'genome2', 'unassigned'] file_list = [ expand("allelic_bams/{sample}.{suffix}.sorted.bam", sample = samples, diff --git a/snakePipes/workflows/mRNA-seq/mRNA-seq b/snakePipes/workflows/mRNA-seq/mRNA-seq index 80f369abd..2f4bbcd27 100755 --- a/snakePipes/workflows/mRNA-seq/mRNA-seq +++ b/snakePipes/workflows/mRNA-seq/mRNA-seq @@ -14,6 +14,7 @@ import sys import textwrap import snakePipes.common_functions as cf import snakePipes.parserCommon as parserCommon +import warnings def parse_args(defaults={"verbose": False, "configFile": None, @@ -51,7 +52,7 @@ def parse_args(defaults={"verbose": False, "configFile": None, # Workflow options optional = parser.add_argument_group('Options') optional.add_argument("-m", "--mode", - help="workflow running modes (available: 'alignment-free, alignment, allelic-mapping, deepTools_qc, three-prime-seq')" + help="workflow running modes (available: 'alignment-free, alignment, allelic-mapping, allelic-counting, deepTools_qc, three-prime-seq')" " (default: '%(default)s')", default=defaults["mode"]) @@ -159,16 +160,19 @@ def main(): args.SNPfile = os.path.abspath(args.SNPfile) args.NMaskedIndex = os.path.abspath(args.NMaskedIndex) modeTemp = args.mode.split(",") - validModes = set(["alignment", "alignment-free", "deepTools_qc", "allelic-mapping","three-prime-seq"]) + validModes = set(["alignment", "alignment-free", "deepTools_qc", "allelic-mapping", "allelic-counting", "three-prime-seq"]) for mode in modeTemp: if mode not in validModes: sys.exit("{} is not a valid mode!\n".format(mode)) if "alignment" not in modeTemp and args.UMIDedup: sys.exit("UMIDedup is only valid for \"alignment\" mode!\n") if args.fromBAM and ("alignment-free" in modeTemp ): - sys.exit("\n--fromBAM can only be used with modes \'alignment\' , \'allelic-mapping\' or \'deepTools_qc\' - use one of these modes or provide fastq files!\n") + sys.exit("\n--fromBAM can only be used with modes \'alignment\' , \'allelic-mapping\' , \'allelic-counting\' or \'deepTools_qc\' - use one of these modes or provide fastq files!\n") if args.fromBAM: args.aligner = "EXTERNAL_BAM" + if "allelic-counting" in modeTemp and not args.fromBAM: + warnings.warn("--fromBAM is required with allelic-counting mode. Setting to True.") + args.fromBAM = True if args.rMats and not args.sampleSheet: sys.exit("--rMats flag requires a sampleSheet (specified with --sampleSheet).\n") if "three_prime_seq" in mode: From f5ff941cccb82cde288de6138d66386c4c9a1323 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Mon, 8 Apr 2024 15:26:23 +0200 Subject: [PATCH 43/66] test da --- .ci_stuff/test_dag.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh index afdf07b32..fe5a5228d 100755 --- a/.ci_stuff/test_dag.sh +++ b/.ci_stuff/test_dag.sh @@ -314,6 +314,10 @@ WC=`mRNA-seq -m allelic-mapping,deepTools_qc -i PE_input -o output --sampleSheet if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2539 ]; then exit 1 ; fi WC=`mRNA-seq -m allelic-mapping,deepTools_qc,alignment-free -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 3294 ]; then exit 1 ; fi +WC=`mRNA-seq -m allelic-counting,deepTools_qc -i allelic_BAM_input/allelic_bams --fromBAM --bamExt '.sorted.bam' -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1408 ]; then exit 1 ; fi + + WC=`noncoding-RNA-seq -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1370 ]; then exit 1 ; fi From 57e0cd4b000587656c95e24d7917c22da8f2373a Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 09:34:23 +0200 Subject: [PATCH 44/66] quotes --- snakePipes/shared/rules/LinkBam.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/shared/rules/LinkBam.snakefile b/snakePipes/shared/rules/LinkBam.snakefile index d95610614..d58753002 100644 --- a/snakePipes/shared/rules/LinkBam.snakefile +++ b/snakePipes/shared/rules/LinkBam.snakefile @@ -5,7 +5,7 @@ if pipeline=="mRNAseq" and mode in "allelic-counting": input: indir + "/{sample}.{suffix}" + bamExt output: - "allelic_bams/{sample}.{suffix} + bamExt + "allelic_bams/{sample}.{suffix}" + bamExt params: input_bai = indir + "/{sample}.{suffix}" + bamExt + ".bai", output_bai = "allelic_bams/{sample}.{suffix}" + bamExt + ".bai" From 67926b81a8bb3745c4eb74eceec695b44b8d9afc Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 09:43:11 +0200 Subject: [PATCH 45/66] quotes --- snakePipes/shared/rules/LinkBam.snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snakePipes/shared/rules/LinkBam.snakefile b/snakePipes/shared/rules/LinkBam.snakefile index d58753002..432137d6f 100644 --- a/snakePipes/shared/rules/LinkBam.snakefile +++ b/snakePipes/shared/rules/LinkBam.snakefile @@ -17,9 +17,9 @@ if pipeline=="mRNAseq" and mode in "allelic-counting": rule samtools_index_external: input: - "allelic_bams/{sample}.{suffix} + bamExt + "allelic_bams/{sample}.{suffix}" + bamExt output: - "allelic_bams/{sample}.{suffix} + bamExt + ".bai" + "allelic_bams/{sample}.{suffix}" + bamExt + ".bai" conda: CONDA_SHARED_ENV shell: "if [[ ! -f {output[0]} ]]; then samtools index {input[0]}; fi" From f4dc7aeddedb41a40d1297ae3cfa638b482064f5 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 09:48:05 +0200 Subject: [PATCH 46/66] quotes --- snakePipes/workflows/mRNA-seq/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/workflows/mRNA-seq/Snakefile b/snakePipes/workflows/mRNA-seq/Snakefile index 5252331d4..df100e1b6 100755 --- a/snakePipes/workflows/mRNA-seq/Snakefile +++ b/snakePipes/workflows/mRNA-seq/Snakefile @@ -126,7 +126,7 @@ include:os.path.join(maindir, "shared", "rules", "RNA-seq_qc_report.snakefile") ## DESeq2 if sampleSheet and not "three-prime-seq" in mode: - if isMultipleComparison and not "allelic-mapping" in mode and not "allelic-counting" in mode": + if isMultipleComparison and not "allelic-mapping" in mode and not "allelic-counting" in mode: include: os.path.join(maindir, "shared", "rules", "DESeq2.multipleComp.snakefile") if rMats: include: os.path.join(maindir, "shared", "rules", "rMats.multipleComp.snakefile") From 6fe56df464fd088cbdc2a37f8237533a34e96bb6 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 10:09:03 +0200 Subject: [PATCH 47/66] get samples from bam suffix --- snakePipes/common_functions.py | 13 +++++++++++++ snakePipes/workflows/mRNA-seq/internals.snakefile | 5 ++++- snakePipes/workflows/mRNA-seq/mRNA-seq | 1 + 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/snakePipes/common_functions.py b/snakePipes/common_functions.py index 61d4d56cf..0a3fcf867 100644 --- a/snakePipes/common_functions.py +++ b/snakePipes/common_functions.py @@ -172,6 +172,19 @@ def get_sample_names_bam(infiles, bamExt): s.append(x) return sorted(list(set(s))) +def get_sample_names_suffix_bam(infiles, bamExt): + """ + Get sample names without file extensions + """ + bamSuff = [x + bamExt for x in ["genome1","genome2","unassigned","allele_flagged"]] + s = [] + for x in infiles: + for y in bamSuff: + if y in os.path.basename(x): + x = os.path.basename(x).replace(y, "") + s.append(x) + return sorted(list(set(s))) + def is_paired(infiles, ext, reads): """ diff --git a/snakePipes/workflows/mRNA-seq/internals.snakefile b/snakePipes/workflows/mRNA-seq/internals.snakefile index f9e3c6ba8..246fa5a3b 100644 --- a/snakePipes/workflows/mRNA-seq/internals.snakefile +++ b/snakePipes/workflows/mRNA-seq/internals.snakefile @@ -37,7 +37,10 @@ if not fromBAM: pairedEnd = cf.is_paired(infiles, ext, reads) else: infiles = sorted(glob.glob(os.path.join(str(indir or ''), '*' + bamExt))) - samples = cf.get_sample_names_bam(infiles, bamExt) + if mode in "allelic-counting": + samples = cf.get_sample_names_suffix_bam(infiles, bamExt) + else: + samples = cf.get_sample_names_bam(infiles, bamExt) if formula and not sampleSheet: print("In order to apply custom formula, please provide a sample sheet!") diff --git a/snakePipes/workflows/mRNA-seq/mRNA-seq b/snakePipes/workflows/mRNA-seq/mRNA-seq index 2f4bbcd27..755399773 100755 --- a/snakePipes/workflows/mRNA-seq/mRNA-seq +++ b/snakePipes/workflows/mRNA-seq/mRNA-seq @@ -173,6 +173,7 @@ def main(): if "allelic-counting" in modeTemp and not args.fromBAM: warnings.warn("--fromBAM is required with allelic-counting mode. Setting to True.") args.fromBAM = True + args.aligner = "EXTERNAL_BAM" if args.rMats and not args.sampleSheet: sys.exit("--rMats flag requires a sampleSheet (specified with --sampleSheet).\n") if "three_prime_seq" in mode: From f7a3cdb4dd0392a99972ee2f3c5897cb52f8b570 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 10:15:44 +0200 Subject: [PATCH 48/66] fix --- snakePipes/workflows/mRNA-seq/internals.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/workflows/mRNA-seq/internals.snakefile b/snakePipes/workflows/mRNA-seq/internals.snakefile index 246fa5a3b..93470fd77 100644 --- a/snakePipes/workflows/mRNA-seq/internals.snakefile +++ b/snakePipes/workflows/mRNA-seq/internals.snakefile @@ -37,7 +37,7 @@ if not fromBAM: pairedEnd = cf.is_paired(infiles, ext, reads) else: infiles = sorted(glob.glob(os.path.join(str(indir or ''), '*' + bamExt))) - if mode in "allelic-counting": + if "allelic-counting" in mode: samples = cf.get_sample_names_suffix_bam(infiles, bamExt) else: samples = cf.get_sample_names_bam(infiles, bamExt) From b23d8505cf14f76f6baf5a9b664942b79e4810b7 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 10:23:19 +0200 Subject: [PATCH 49/66] dot --- snakePipes/common_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/common_functions.py b/snakePipes/common_functions.py index 0a3fcf867..43d2a6183 100644 --- a/snakePipes/common_functions.py +++ b/snakePipes/common_functions.py @@ -176,7 +176,7 @@ def get_sample_names_suffix_bam(infiles, bamExt): """ Get sample names without file extensions """ - bamSuff = [x + bamExt for x in ["genome1","genome2","unassigned","allele_flagged"]] + bamSuff = [x + bamExt for x in [".genome1",".genome2",".unassigned",".allele_flagged"]] s = [] for x in infiles: for y in bamSuff: From 85f2299039ec55097bd632d49bd32b54157bb745 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 10:33:37 +0200 Subject: [PATCH 50/66] link bam --- snakePipes/shared/rules/LinkBam.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/shared/rules/LinkBam.snakefile b/snakePipes/shared/rules/LinkBam.snakefile index 432137d6f..f27696d2e 100644 --- a/snakePipes/shared/rules/LinkBam.snakefile +++ b/snakePipes/shared/rules/LinkBam.snakefile @@ -1,6 +1,6 @@ import os -if pipeline=="mRNAseq" and mode in "allelic-counting": +if pipeline=="mRNAseq" and "allelic-counting" in mode: rule link_bam: input: indir + "/{sample}.{suffix}" + bamExt From 94c47ea0239cdeb8de41555dbb9bc9d603f6e4b4 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 10:47:34 +0200 Subject: [PATCH 51/66] pipeline --- snakePipes/shared/rules/LinkBam.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/shared/rules/LinkBam.snakefile b/snakePipes/shared/rules/LinkBam.snakefile index f27696d2e..ea498b0f4 100644 --- a/snakePipes/shared/rules/LinkBam.snakefile +++ b/snakePipes/shared/rules/LinkBam.snakefile @@ -1,6 +1,6 @@ import os -if pipeline=="mRNAseq" and "allelic-counting" in mode: +if pipeline=="rna-seq" and "allelic-counting" in mode: rule link_bam: input: indir + "/{sample}.{suffix}" + bamExt From d4f7d44b2621b86f676be4b1daec179c4713f9c1 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 11:10:34 +0200 Subject: [PATCH 52/66] test dag --- .ci_stuff/test_dag.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh index fe5a5228d..5fe7fab19 100755 --- a/.ci_stuff/test_dag.sh +++ b/.ci_stuff/test_dag.sh @@ -101,6 +101,30 @@ touch allelic_BAM_input/allelic_bams/sample1.genome1.sorted.bam \ allelic_BAM_input/allelic_bams/sample5.genome2.sorted.bam.bai \ allelic_BAM_input/allelic_bams/sample6.genome1.sorted.bam.bai \ allelic_BAM_input/allelic_bams/sample6.genome2.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample1.allele_flagged.sorted.bam \ + allelic_BAM_input/allelic_bams/sample1.unassigned.sorted.bam \ + allelic_BAM_input/allelic_bams/sample2.allele_flagged.sorted.bam \ + allelic_BAM_input/allelic_bams/sample2.unassigned.sorted.bam \ + allelic_BAM_input/allelic_bams/sample3.allele_flagged.sorted.bam \ + allelic_BAM_input/allelic_bams/sample3.unassigned.sorted.bam \ + allelic_BAM_input/allelic_bams/sample4.allele_flagged.sorted.bam \ + allelic_BAM_input/allelic_bams/sample4.unassigned.sorted.bam \ + allelic_BAM_input/allelic_bams/sample5.allele_flagged.sorted.bam \ + allelic_BAM_input/allelic_bams/sample5.unassigned.sorted.bam \ + allelic_BAM_input/allelic_bams/sample6.allele_flagged.sorted.bam \ + allelic_BAM_input/allelic_bams/sample6.unassigned.sorted.bam \ + allelic_BAM_input/allelic_bams/sample1.allele_flagged.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample1.unassigned.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample2.allele_flagged.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample2.unassigned.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample3.allele_flagged.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample3.unassigned.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample4.allele_flagged.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample4.unassigned.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample5.allele_flagged.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample5.unassigned.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample6.allele_flagged.sorted.bam.bai \ + allelic_BAM_input/allelic_bams/sample6.unassigned.sorted.bam.bai \ allelic_BAM_input/deepTools_qc/bamPEFragmentSize/fragmentSize.metric.tsv \ allelic_BAM_input/filtered_bam/sample1.filtered.bam \ allelic_BAM_input/filtered_bam/sample2.filtered.bam \ From 9eeb65da41510df28cfe9297ab69e654fdad48cc Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 11:24:52 +0200 Subject: [PATCH 53/66] deseq2 --- snakePipes/shared/rules/DESeq2.singleComp.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile index 3464a52be..538a0fa85 100644 --- a/snakePipes/shared/rules/DESeq2.singleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.singleComp.snakefile @@ -7,7 +7,7 @@ def get_outdir(folder_name,sampleSheet): ## DESeq2 (on featureCounts) rule DESeq2: input: - counts_table = lambda wildcards : "featureCounts/counts_allelic.tsv" if 'allelic-mapping' in mode else "featureCounts/counts.tsv", + counts_table = lambda wildcards : "featureCounts/counts_allelic.tsv" if 'allelic-mapping' in mode or "allelic-counting" in mode else "featureCounts/counts.tsv", sampleSheet = sampleSheet, symbol_file = "Annotation/genes.filtered.symbol" #get_symbol_file output: From 013a780a64f2e53c759a986b6cbe9b89a3878f4e Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 11:32:56 +0200 Subject: [PATCH 54/66] rm nonallelic deeptools qc --- snakePipes/workflows/mRNA-seq/Snakefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/snakePipes/workflows/mRNA-seq/Snakefile b/snakePipes/workflows/mRNA-seq/Snakefile index df100e1b6..c4c97a8a9 100755 --- a/snakePipes/workflows/mRNA-seq/Snakefile +++ b/snakePipes/workflows/mRNA-seq/Snakefile @@ -27,8 +27,9 @@ include: os.path.join(maindir, "shared", "tools" , "deeptools_cmds.snakefile") ## filtered annotation (GTF) include: os.path.join(maindir, "shared", "rules", "filterGTF.snakefile") -## bamCoverage_RPKM -include: os.path.join(maindir, "shared", "rules", "deepTools_RNA.snakefile") +if not "allelic-counting" in mode: + ## bamCoverage_RPKM + include: os.path.join(maindir, "shared", "rules", "deepTools_RNA.snakefile") if not fromBAM: From 3aadc6e4c7bee1b9be3bc11f711190f3a2d999a5 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 11:45:03 +0200 Subject: [PATCH 55/66] multiqc --- snakePipes/shared/rules/multiQC.snakefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/snakePipes/shared/rules/multiQC.snakefile b/snakePipes/shared/rules/multiQC.snakefile index 308cbc060..efc2df5b0 100755 --- a/snakePipes/shared/rules/multiQC.snakefile +++ b/snakePipes/shared/rules/multiQC.snakefile @@ -66,7 +66,7 @@ def multiqc_input_check(return_value): indir += "allelic_bams" elif pipeline=="rna-seq": # must be RNA-mapping, add files as per the mode - if ( "alignment" in mode or "deepTools_qc" in mode or "three-prime-seq" in mode ) and not "allelic-mapping" in mode: + if ( "alignment" in mode or "deepTools_qc" in mode or "three-prime-seq" in mode ) and not "allelic-mapping" in mode and not "allelic-counting" in mode: infiles.append( expand(aligner+"/{sample}.bam", sample = samples) + expand("Sambamba/{sample}.markdup.txt", sample = samples) + expand("deepTools_qc/estimateReadFiltering/{sample}_filtering_estimation.txt",sample=samples)+ @@ -74,9 +74,10 @@ def multiqc_input_check(return_value): indir += aligner + " featureCounts " indir += " Sambamba " indir += " deepTools_qc " - if "allelic-mapping" in mode: + if "allelic-mapping" in mode or "allelic-counting" in mode: infiles.append( expand("featureCounts/{sample}.allelic_counts.txt", sample = samples) ) indir += aligner + " featureCounts " + if "allelic-mapping" in mode: infiles.append( expand("allelic_bams/{sample}.SNPsplit_report.yaml", sample = samples) ) infiles.append( expand("allelic_bams/{sample}.SNPsplit_sort.yaml", sample = samples) ) indir += "allelic_bams" From 2ff14b467f6f37adc49c3f36b26ca9d2dc3ab9b3 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 13:17:06 +0200 Subject: [PATCH 56/66] rm dpTqc --- .ci_stuff/test_dag.sh | 2 +- snakePipes/workflows/mRNA-seq/mRNA-seq | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh index 5fe7fab19..c8ac0d0a0 100755 --- a/.ci_stuff/test_dag.sh +++ b/.ci_stuff/test_dag.sh @@ -338,7 +338,7 @@ WC=`mRNA-seq -m allelic-mapping,deepTools_qc -i PE_input -o output --sampleSheet if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2539 ]; then exit 1 ; fi WC=`mRNA-seq -m allelic-mapping,deepTools_qc,alignment-free -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 3294 ]; then exit 1 ; fi -WC=`mRNA-seq -m allelic-counting,deepTools_qc -i allelic_BAM_input/allelic_bams --fromBAM --bamExt '.sorted.bam' -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` +WC=`mRNA-seq -m allelic-counting -i allelic_BAM_input/allelic_bams --fromBAM --bamExt '.sorted.bam' -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1408 ]; then exit 1 ; fi diff --git a/snakePipes/workflows/mRNA-seq/mRNA-seq b/snakePipes/workflows/mRNA-seq/mRNA-seq index 755399773..31467d18c 100755 --- a/snakePipes/workflows/mRNA-seq/mRNA-seq +++ b/snakePipes/workflows/mRNA-seq/mRNA-seq @@ -166,6 +166,8 @@ def main(): sys.exit("{} is not a valid mode!\n".format(mode)) if "alignment" not in modeTemp and args.UMIDedup: sys.exit("UMIDedup is only valid for \"alignment\" mode!\n") + if "allelic-counting" in modeTemp and "deepTools_qc" in modeTemp: + sys.exit("Mode deepTools_qc is not compatible with mode allelic-counting.") if args.fromBAM and ("alignment-free" in modeTemp ): sys.exit("\n--fromBAM can only be used with modes \'alignment\' , \'allelic-mapping\' , \'allelic-counting\' or \'deepTools_qc\' - use one of these modes or provide fastq files!\n") if args.fromBAM: From 8cfe3ce5f0c7118c71c855339135d70603dc7d16 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 13:23:34 +0200 Subject: [PATCH 57/66] test dag --- .ci_stuff/test_dag.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh index c8ac0d0a0..dce808625 100755 --- a/.ci_stuff/test_dag.sh +++ b/.ci_stuff/test_dag.sh @@ -339,7 +339,7 @@ if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2539 ]; then exit 1 ; fi WC=`mRNA-seq -m allelic-mapping,deepTools_qc,alignment-free -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 3294 ]; then exit 1 ; fi WC=`mRNA-seq -m allelic-counting -i allelic_BAM_input/allelic_bams --fromBAM --bamExt '.sorted.bam' -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l` -if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1408 ]; then exit 1 ; fi +if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 659 ]; then exit 1 ; fi From e221ebd26f8388b9a755a3afdaac0a78514d7923 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 13:47:14 +0200 Subject: [PATCH 58/66] News.rst and documentation --- docs/content/News.rst | 1 + docs/content/workflows/mRNA-seq.rst | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/docs/content/News.rst b/docs/content/News.rst index be3c759eb..5adf8224b 100644 --- a/docs/content/News.rst +++ b/docs/content/News.rst @@ -6,6 +6,7 @@ ________________ * added SEACR peaks qc * added external bed functionality for differential binding in ChIP-seq and ATAC-seq workflows +* added "allelic-counting" mode to mRNA-seq, allowing to count reads and run DGE from allelic bam files split e.g. by whatshap * fixed copyfile command for sampleSheet * removed deprecated --force argument from mamba commands * fixes #998 diff --git a/docs/content/workflows/mRNA-seq.rst b/docs/content/workflows/mRNA-seq.rst index e26d52564..a0e6a5cf0 100644 --- a/docs/content/workflows/mRNA-seq.rst +++ b/docs/content/workflows/mRNA-seq.rst @@ -194,6 +194,11 @@ Allele-specific, gene-level differential expression analysis is then performed u .. note:: **allelic-mapping** mode is mutually exclusive with **mapping** mode +"allelic-counting" +~~~~~~~~~~~~~~~~~~ + +**allelic-counting** mode requires the user to input, per sample, 4 bam files, corresponding to haplotype1, haplotype2, unassigned and haplotagged , e.g. as generated by whatshap. The respective suffixes ".genome1", ".genome2", ".unassigned", ".alelle_flagged" are required to be followed by the bam extention ".sorted.bam". This mode is mutually exclusive with "deepTools_qc". Only the allelic version of deepTools qc will be run, by default. Allelic version of featureCounts will be run by default. If sample sheet is provided, allelic DESeq2- or allelic Salmon-based differential gene expression analysis will be run. + "alignment-free" ~~~~~~~~~~~~~~~~ From 6e93d28eb694b4846d93d0dcaf73b160c835e515 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 13:49:00 +0200 Subject: [PATCH 59/66] flake --- snakePipes/common_functions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/snakePipes/common_functions.py b/snakePipes/common_functions.py index 43d2a6183..abe36b324 100644 --- a/snakePipes/common_functions.py +++ b/snakePipes/common_functions.py @@ -172,11 +172,12 @@ def get_sample_names_bam(infiles, bamExt): s.append(x) return sorted(list(set(s))) + def get_sample_names_suffix_bam(infiles, bamExt): """ Get sample names without file extensions """ - bamSuff = [x + bamExt for x in [".genome1",".genome2",".unassigned",".allele_flagged"]] + bamSuff = [x + bamExt for x in [".genome1", ".genome2", ".unassigned", ".allele_flagged"]] s = [] for x in infiles: for y in bamSuff: From 209c2bf3f304d95fb325fd118d6536e3fc4e14c8 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 14:06:27 +0200 Subject: [PATCH 60/66] fix aligner for multiqc --- snakePipes/workflows/mRNA-seq/mRNA-seq | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/snakePipes/workflows/mRNA-seq/mRNA-seq b/snakePipes/workflows/mRNA-seq/mRNA-seq index 31467d18c..a02f4d012 100755 --- a/snakePipes/workflows/mRNA-seq/mRNA-seq +++ b/snakePipes/workflows/mRNA-seq/mRNA-seq @@ -175,7 +175,9 @@ def main(): if "allelic-counting" in modeTemp and not args.fromBAM: warnings.warn("--fromBAM is required with allelic-counting mode. Setting to True.") args.fromBAM = True - args.aligner = "EXTERNAL_BAM" + if "allelic-counting" in modeTemp: + args.bamExt = ".sorted.bam" + args.aligner = "allelic_bams" if args.rMats and not args.sampleSheet: sys.exit("--rMats flag requires a sampleSheet (specified with --sampleSheet).\n") if "three_prime_seq" in mode: From b60183a46ab675a836e8dce0128b4f259cd600ec Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 14:11:04 +0200 Subject: [PATCH 61/66] fix deseq2 snakemake object --- .../shared/rules/DESeq2.singleComp.snakefile | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile index 538a0fa85..ae00b05fd 100644 --- a/snakePipes/shared/rules/DESeq2.singleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.singleComp.snakefile @@ -19,27 +19,29 @@ rule DESeq2: outdir = get_outdir("DESeq2",sampleSheet), fdr = fdr, importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), - allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE', + allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode or "allelic-counting" in mode else 'FALSE', tx2gene_file = 'NA', rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), - formula = config["formula"] + formula = config["formula"], + counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table) log: out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",sampleSheet)), err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",sampleSheet)) conda: CONDA_RNASEQ_ENV - shell: - "cd {params.outdir} && " - "Rscript {params.script} " - "{input.sampleSheet} " # 1 - "../{input.counts_table} " # 2 - "{params.fdr} " # 3 - "../{input.symbol_file} " # 4 - "{params.importfunc} " # 5 - "{params.allele_info} " # 6 - "{params.tx2gene_file} " # 7 - "{params.rmdTemplate} " # 8 - "{params.formula} " #9 - " > ../{log.out} 2> ../{log.err}" + script: "{params.script}" +# shell: +# "cd {params.outdir} && " +# "Rscript {params.script} " +# "{input.sampleSheet} " # 1 +# "../{input.counts_table} " # 2 +# "{params.fdr} " # 3 +# "../{input.symbol_file} " # 4 +# "{params.importfunc} " # 5 +# "{params.allele_info} " # 6 +# "{params.tx2gene_file} " # 7 +# "{params.rmdTemplate} " # 8 +# "{params.formula} " #9 +# " > ../{log.out} 2> ../{log.err}" ## DESeq2 (on Salmon) From 6fe2e54e00f0f1d175e7b47828a012b01a46c0c8 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Tue, 9 Apr 2024 14:16:27 +0200 Subject: [PATCH 62/66] fix samplesheet --- snakePipes/shared/rules/DESeq2.singleComp.snakefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile index ae00b05fd..9c27fc8d3 100644 --- a/snakePipes/shared/rules/DESeq2.singleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.singleComp.snakefile @@ -15,7 +15,8 @@ rule DESeq2: benchmark: "{}/.benchmark/DESeq2.featureCounts.benchmark".format(get_outdir("DESeq2",sampleSheet)) params: - script=os.path.join(maindir, "shared", "rscripts", "DESeq2.R"), + script = os.path.join(maindir, "shared", "rscripts", "DESeq2.R"), + sampleSheet = lambda wildcards,input: input.sampleSheet, outdir = get_outdir("DESeq2",sampleSheet), fdr = fdr, importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), From 7771c4a7ad07a8d42305bdaaf1e6d6ed4f1f37f8 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 10 Apr 2024 11:54:53 +0200 Subject: [PATCH 63/66] deseq2 symbol file --- snakePipes/shared/rscripts/DESeq2.R | 2 +- .../shared/rules/DESeq2.multipleComp.snakefile | 3 ++- .../shared/rules/DESeq2.singleComp.snakefile | 17 ++--------------- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/snakePipes/shared/rscripts/DESeq2.R b/snakePipes/shared/rscripts/DESeq2.R index 9d1255162..d762edd34 100755 --- a/snakePipes/shared/rscripts/DESeq2.R +++ b/snakePipes/shared/rscripts/DESeq2.R @@ -29,7 +29,7 @@ sampleInfoFilePath <- snakemake@params[["sampleSheet"]] countFilePath <- snakemake@params[["counts_table"]] fdr <- as.numeric(snakemake@params[["fdr"]]) -geneNamesFilePath <- snakemake@input[["symbol_file"]] +geneNamesFilePath <- snakemake@params[["symbol_file"]] importfunc <- snakemake@params[["importfunc"]] allelic_info <- as.logical(snakemake@params[["allele_info"]]) tx2gene_file <- snakemake@params[["tx2gene_file"]] diff --git a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile index 68b13562c..1a32be242 100644 --- a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile @@ -36,7 +36,8 @@ rule DESeq2: tx2gene_file = 'NA', rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), formula = config["formula"], - counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table) + counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table), + symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file) log: out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")), err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")) diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile index 9c27fc8d3..c62349da9 100644 --- a/snakePipes/shared/rules/DESeq2.singleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.singleComp.snakefile @@ -24,26 +24,13 @@ rule DESeq2: tx2gene_file = 'NA', rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), formula = config["formula"], - counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table) + counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table), + symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file) log: out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",sampleSheet)), err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",sampleSheet)) conda: CONDA_RNASEQ_ENV script: "{params.script}" -# shell: -# "cd {params.outdir} && " -# "Rscript {params.script} " -# "{input.sampleSheet} " # 1 -# "../{input.counts_table} " # 2 -# "{params.fdr} " # 3 -# "../{input.symbol_file} " # 4 -# "{params.importfunc} " # 5 -# "{params.allele_info} " # 6 -# "{params.tx2gene_file} " # 7 -# "{params.rmdTemplate} " # 8 -# "{params.formula} " #9 -# " > ../{log.out} 2> ../{log.err}" - ## DESeq2 (on Salmon) rule DESeq2_Salmon_basic: From 438508189cb2a2e4a6cc45fdcdff4319ca5e2e6f Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 10 Apr 2024 11:57:49 +0200 Subject: [PATCH 64/66] deseq2 salmon --- .../shared/rules/DESeq2.singleComp.snakefile | 41 ++++++------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile index c62349da9..2e506fd5b 100644 --- a/snakePipes/shared/rules/DESeq2.singleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.singleComp.snakefile @@ -52,23 +52,14 @@ rule DESeq2_Salmon_basic: fdr = fdr, importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), allele_info = 'FALSE', - tx2gene_file = "Annotation/genes.filtered.t2g", + tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"), rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), - formula = config["formula"] + formula = config["formula"], + counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table), + symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file) conda: CONDA_RNASEQ_ENV - shell: - "cd {params.outdir} && " - "Rscript {params.script} " - "{input.sampleSheet} " # 1 - "../{input.counts_table} " # 2 - "{params.fdr} " # 3 - "../{input.symbol_file} " # 4 - "{params.importfunc} " # 5 - "{params.allele_info} " # 6 - "../{input.tx2gene_file} " # 7 - "{params.rmdTemplate} " # 8 - "{params.formula} " # 9 - " > ../{log.out} 2> ../{log.err}" + script: "{params.script}" + rule DESeq2_Salmon_allelic: input: @@ -89,18 +80,10 @@ rule DESeq2_Salmon_allelic: fdr = fdr, importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), allele_info = 'TRUE', - tx2gene_file = "Annotation/genes.filtered.t2g", - rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd") + tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"), + rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), + formula = config["formula"], + counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table), + symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file) conda: CONDA_RNASEQ_ENV - shell: - "cd {params.outdir} && " - "Rscript {params.script} " - "{input.sampleSheet} " # 1 - "../{input.counts_table} " # 2 - "{params.fdr} " # 3 - "../{input.symbol_file} " # 4 - "{params.importfunc} " # 5 - "{params.allele_info} " # 6 - "../{input.tx2gene_file} " # 7 - "{params.rmdTemplate} " # 8 - " > ../{log.out} 2> ../{log.err}" + script: "{params.script}" From 1280216dede0aa7861eba1def5ceaac241d58b78 Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 10 Apr 2024 11:59:12 +0200 Subject: [PATCH 65/66] deseq2 multicomp --- .../rules/DESeq2.multipleComp.snakefile | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile index 1a32be242..afb418e9e 100644 --- a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile +++ b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile @@ -65,20 +65,10 @@ rule DESeq2_Salmon_basic: fdr = fdr, importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"), allele_info = 'FALSE', - tx2gene_file = "Annotation/genes.filtered.t2g", + tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"), rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"), - formula = config["formula"] + formula = config["formula"], + counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table), + symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file) conda: CONDA_RNASEQ_ENV - shell: - "cd {params.outdir} && " - "Rscript {params.script} " - "{params.sampleSheet} " # 1 - "../{input.counts_table} " # 2 - "{params.fdr} " # 3 - "../{input.symbol_file} " # 4 - "{params.importfunc} " # 5 - "{params.allele_info} " # 6 - "../{input.tx2gene_file} " # 7 - "{params.rmdTemplate} " # 8 - "{params.formula} " - " > ../{log.out} 2> ../{log.err}" + script: "{params.script}" From 94d9f3602cbe10998d036e399aaf2b0ffc9304ec Mon Sep 17 00:00:00 2001 From: "katarzyna.otylia.sikora@gmail.com" Date: Wed, 10 Apr 2024 12:44:37 +0200 Subject: [PATCH 66/66] bumped versions and docs --- conda-recipe/meta.yaml | 2 +- docs/content/News.rst | 8 +++++--- docs/content/workflows/mRNA-seq.rst | 3 +++ snakePipes/__init__.py | 2 +- snakePipes/shared/defaults.yaml | 2 +- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index d7538294e..f9aabba44 100755 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,6 +1,6 @@ package: name: snakepipes - version: 2.8.1 + version: 2.8.2 source: path: ../ diff --git a/docs/content/News.rst b/docs/content/News.rst index 5adf8224b..fcbefb31e 100644 --- a/docs/content/News.rst +++ b/docs/content/News.rst @@ -1,19 +1,21 @@ snakePipes News =============== -snakePipes x.y.z +snakePipes 2.8.2 ________________ * added SEACR peaks qc * added external bed functionality for differential binding in ChIP-seq and ATAC-seq workflows * added "allelic-counting" mode to mRNA-seq, allowing to count reads and run DGE from allelic bam files split e.g. by whatshap +* added support for custom model formula to mRNA-seq workflow * fixed copyfile command for sampleSheet * removed deprecated --force argument from mamba commands * fixes #998 * fixes #997 * fixes #996 - - +* fixes #994 +* fixes #1000 +* fixes #1001 snakePipes 2.8.1 ---------------- diff --git a/docs/content/workflows/mRNA-seq.rst b/docs/content/workflows/mRNA-seq.rst index a0e6a5cf0..4393ea282 100644 --- a/docs/content/workflows/mRNA-seq.rst +++ b/docs/content/workflows/mRNA-seq.rst @@ -144,6 +144,9 @@ Like the other workflows, differential expression can be performed using the ``- .. note:: The first entry defines which group of samples are control. This way, the order of comparison and likewise the sign of values can be changed. The DE analysis might fail if your sample names begin with a number. So watch out for that! + +Optionally, the user may submit their desired model formula (without the leading tilda ``~``) with ``--formula``. + Differential Splicing --------------------- diff --git a/snakePipes/__init__.py b/snakePipes/__init__.py index 80e22f7ae..964a32ab4 100755 --- a/snakePipes/__init__.py +++ b/snakePipes/__init__.py @@ -1 +1 @@ -__version__ = '2.8.1' +__version__ = '2.8.2' diff --git a/snakePipes/shared/defaults.yaml b/snakePipes/shared/defaults.yaml index 1975203cb..f35badc8a 100755 --- a/snakePipes/shared/defaults.yaml +++ b/snakePipes/shared/defaults.yaml @@ -7,7 +7,7 @@ # permitted here. ################################################################################ # -condaEnvDir: '/package/mamba/envs/snakepipesenvs/2.8.1' +condaEnvDir: '/package/mamba/envs/snakepipesenvs/2.8.2' snakemakeOptions: '' organismsDir: 'shared/organisms' clusterConfig: 'shared/cluster.yaml'