diff --git a/docs/content/News.rst b/docs/content/News.rst index 5c5ba0905..b518e4e2b 100644 --- a/docs/content/News.rst +++ b/docs/content/News.rst @@ -1,6 +1,14 @@ snakePipes News =============== +snakePipes 2.7.2 +---------------- +* STAR version has been updated to 2.7.10b. 2.7.10a was returning a segmentation fault on macOS. +* STAR command has been updated. Now, STAR itself offers a command line option for processing input files. +* Put a cap on the Python version for the deeptools env. The current version of deeptools does not support newer Python versions, which causes some tools to fail. +* Update default condaDir. +* The filter_gtf function has become a bit more versatile. GTF files that include delimiters (';') in e.g. a description field are now allowed. Gene names are also allowed to have symbols now. Lastly, GTF files that have xRNA instead of transcript as a feature in column 3 can also be parsed. + snakePipes 2.7.1 ---------------- * Capped snakemake version to < 7.19.0. @@ -64,9 +72,9 @@ snakePipes 2.5.1 * Updated Bowtie2 parameters for the cut and tag data * Updated multibamSummary in ChIPSeq pipeline for data with spike-in * Uncommented the BamCompare for ChIPseq pipeline with spike-in -* set a max thread for each rule -* External PR #764: always check for >0 when generating heatmap for the differential analysis done by deseq2 - +* set a max thread for each rule +* External PR #764: always check for >0 when generating heatmap for the differential analysis done by deseq2 + snakePipes 2.5.0 ---------------- diff --git a/docs/content/running_snakePipes.rst b/docs/content/running_snakePipes.rst index b8d2648fa..0e38bb89a 100644 --- a/docs/content/running_snakePipes.rst +++ b/docs/content/running_snakePipes.rst @@ -35,7 +35,7 @@ All individual jobs of the workflow will be submitted to the Grid engine using t **To run the workflow locally**, use the parameter ``--local`` for local mode and the parameter ``-j 10`` to specify the maximal number of used CPU threads 
(here: 10). -**For single-end FASTQ files**, the workflow automatically recognized single suffix (eg. "sample1.fastq" instead of "sample1_R1.fastq") as single-end reads. However, mixing of single and paired-end files in the same folder is not supported currently. +**For single-end FASTQ files**, note that single-end data still needs a valid suffix (e.g. sample1_R1.fastq.gz). With a proper suffix, single-end mode is detected by default. When executing some workflows with the ``--fromBAM`` flag, it is still necessary to set ``--singleEnd``. Once the DNA-mapping run is finished sucessfully. We can run the ChIP-seq analysis in the same directory. diff --git a/snakePipes/shared/defaults.yaml b/snakePipes/shared/defaults.yaml index 61d34ffbf..489d5299b 100755 --- a/snakePipes/shared/defaults.yaml +++ b/snakePipes/shared/defaults.yaml @@ -7,7 +7,7 @@ # permitted here. ################################################################################ # -snakemakeOptions: ' --use-conda --conda-prefix /package/anaconda3/envs/ ' +snakemakeOptions: ' --use-conda --conda-prefix /package/mamba/envs/ ' organismsDir: 'shared/organisms' clusterConfig: 'shared/cluster.yaml' tempDir: /data/extended/ diff --git a/snakePipes/shared/organisms/mm10_gencodeM19.yaml b/snakePipes/shared/organisms/mm10_gencodeM19.yaml index 9a1ec92a3..a09212538 100644 --- a/snakePipes/shared/organisms/mm10_gencodeM19.yaml +++ b/snakePipes/shared/organisms/mm10_gencodeM19.yaml @@ -13,6 +13,6 @@ star_index: "/data/repository/organisms/GRCm38_ensembl/STARIndex/2.7.10a/" genes_bed: "/data/repository/organisms/GRCm38_ensembl/gencode/m19/genes.bed" genes_gtf: "/data/repository/organisms/GRCm38_ensembl/gencode/m19/genes.gtf" extended_coding_regions_gtf: "/data/repository/organisms/GRCm38_ensembl/gencode/m19/genes.slop.gtf" -blacklist_bed: 
"/data/repository/organisms/GRCm38_ensembl/DKFZ/GRCm38_General_readAttractingRegions.UseThisOne.bed" ignoreForNormalization: "MT X Y JH584299.1 GL456233.1 JH584301.1 GL456211.1 GL456350.1 JH584293.1 GL456221.1 JH584297.1 JH584296.1 GL456354.1 JH584294.1 JH584298.1 JH584300.1 GL456219.1 GL456210.1 JH584303.1 JH584302.1 GL456212.1 JH584304.1 GL456379.1 GL456216.1 GL456393.1 GL456366.1 GL456367.1 GL456239.1 GL456213.1 GL456383.1 GL456385.1 GL456360.1 GL456378.1 GL456389.1 GL456372.1 GL456370.1 GL456381.1 GL456387.1 GL456390.1 GL456394.1 GL456392.1 GL456382.1 GL456359.1 GL456396.1 GL456368.1 JH584292.1 JH584295.1" rmsk_file: '/data/repository/organisms/GRCm38_ensembl/UCSC/mm10/rmsk.txt' diff --git a/snakePipes/shared/rscripts/merge_featureCounts.R b/snakePipes/shared/rscripts/merge_featureCounts.R index 32b20d3ea..22480ca24 100644 --- a/snakePipes/shared/rscripts/merge_featureCounts.R +++ b/snakePipes/shared/rscripts/merge_featureCounts.R @@ -16,7 +16,7 @@ isallelic <- function(x) { get_df <- function(infile) { cat(infile, "\n") bname = gsub(".counts.txt" , "" , basename(infile) ) - df = read.table(infile, header=T) + df = read.table(infile, header=T, sep='\t') if(isallelic(df) == TRUE) { print("Counts are allele-specific") diff --git a/snakePipes/shared/rules/RNA_mapping.snakefile b/snakePipes/shared/rules/RNA_mapping.snakefile index 078850221..50d98ff1b 100755 --- a/snakePipes/shared/rules/RNA_mapping.snakefile +++ b/snakePipes/shared/rules/RNA_mapping.snakefile @@ -123,7 +123,8 @@ elif aligner.upper().find("STAR") >=0: --outStd BAM_Unsorted \ --sjdbGTFfile {params.gtf} \ --genomeDir {params.index} \ - --readFilesIn <(gunzip -c {input.r1}) <(gunzip -c {input.r2}) \ + --readFilesIn {input.r1} {input.r2} \ + --readFilesCommand 'gunzip -c' \ --outFileNamePrefix {params.prefix} \ | samtools sort -m {params.samsort_memory} -T $MYTEMP/{wildcards.sample} -@ {params.samtools_threads} -O bam -o {output.bam} - 2> {log} rm -rf $MYTEMP @@ -160,7 +161,8 @@ elif 
aligner.upper().find("STAR") >=0: --outStd BAM_Unsorted \ --sjdbGTFfile {params.gtf} \ --genomeDir {params.index} \ - --readFilesIn <(gunzip -c {input}) \ + --readFilesIn {input} \ + --readFilesCommand 'gunzip -c' \ --outFileNamePrefix {params.prefix} \ | samtools sort -m {params.samsort_memory} -T $MYTEMP/{wildcards.sample} -@ {params.samtools_threads} -O bam -o {output.bam} - 2> {log} rm -rf $MYTEMP diff --git a/snakePipes/shared/rules/envs/createIndices.yaml b/snakePipes/shared/rules/envs/createIndices.yaml index 8531e54c4..2fe5b66e4 100755 --- a/snakePipes/shared/rules/envs/createIndices.yaml +++ b/snakePipes/shared/rules/envs/createIndices.yaml @@ -8,7 +8,7 @@ dependencies: - ucsc-genepredtobed - bowtie2 = 2.4.5 - hisat2 = 2.2.1 - - star = 2.7.10a + - star = 2.7.10b - bwa = 0.7.17 - bwa-mem2 = 2.2.1 - bwameth = 0.2.5 diff --git a/snakePipes/shared/rules/envs/rna_seq.yaml b/snakePipes/shared/rules/envs/rna_seq.yaml index db4437bb7..aec33ce47 100755 --- a/snakePipes/shared/rules/envs/rna_seq.yaml +++ b/snakePipes/shared/rules/envs/rna_seq.yaml @@ -8,7 +8,7 @@ dependencies: - samtools = 1.15.1 - subread = 2.0.1 - hisat2 = 2.2.1 - - star = 2.7.10a + - star = 2.7.10b - salmon = 1.9.0 - r-base = 4.1.3 - r-wasabi diff --git a/snakePipes/shared/rules/envs/sc_rna_seq.yaml b/snakePipes/shared/rules/envs/sc_rna_seq.yaml index 45557742c..35539cd6e 100644 --- a/snakePipes/shared/rules/envs/sc_rna_seq.yaml +++ b/snakePipes/shared/rules/envs/sc_rna_seq.yaml @@ -5,12 +5,12 @@ channels: dependencies: - umi_tools = 1.1.2 - samtools = 1.15.1 - - star = 2.7.10a + - star = 2.7.10b - r-base = 4.1.3 - r-stringr - r-data.table - - r-seurat = 4.1.1 + - r-seurat = 4.1.1 - r-gtools - r-dplyr - h5py = 3.6.0 - - velocyto.py = 0.17.17 + - velocyto.py = 0.17.17 diff --git a/snakePipes/shared/rules/envs/shared.yaml b/snakePipes/shared/rules/envs/shared.yaml index 9022c6d2a..fe22c5e10 100755 --- a/snakePipes/shared/rules/envs/shared.yaml +++ b/snakePipes/shared/rules/envs/shared.yaml @@ -3,6 
+3,7 @@ channels: - conda-forge - bioconda dependencies: + - python = 3.7.12 #This can be changed together with the update of deeptools to the next version - deeptools = 3.5.1 - seqtk = 1.3 - pigz = 2.6 diff --git a/snakePipes/shared/rules/filterGTF.snakefile b/snakePipes/shared/rules/filterGTF.snakefile index 40684fb19..a0820999a 100644 --- a/snakePipes/shared/rules/filterGTF.snakefile +++ b/snakePipes/shared/rules/filterGTF.snakefile @@ -38,6 +38,7 @@ rule gtf_to_files: "Annotation/genes.filtered.bed" run: import shlex + import re t2g = open(output[0], "w") symbol = open(output[1], "w") @@ -47,9 +48,9 @@ rule gtf_to_files: if line.startswith("#"): continue cols = line.strip().split("\t") + annos = re.split(''';(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', cols[8]) if cols[2] == "gene": # get the gene_name and gene_id values - annos = cols[8].split(";") gene_id = None gene_name = None for anno in annos: @@ -62,9 +63,8 @@ rule gtf_to_files: gene_name = anno[1] if gene_id: symbol.write("{}\t{}\n".format(gene_id, "" if not gene_name else gene_name)) - elif cols[2] == "transcript": + elif cols[2] == "transcript" or 'RNA' in cols[2]: # get the gene_id and transcript_id values - annos = cols[8].split(";") gene_id = None transcript_id = None gene_name = "" @@ -84,7 +84,6 @@ rule gtf_to_files: GTFdict[transcript_id] = [cols[0], cols[3], cols[4], cols[6], [], []] elif cols[2] == "exon": # get the transcript_id - annos = cols[8].split(";") transcript_id = None for anno in annos: anno = shlex.split(anno.strip(), " ") @@ -92,7 +91,7 @@ rule gtf_to_files: continue if anno[0] == "transcript_id": transcript_id = anno[1] - if transcript_id: + if transcript_id and transcript_id in GTFdict: exonWidth = int(cols[4]) - int(cols[3]) + 1 exonOffset = int(cols[3]) - int(GTFdict[transcript_id][1]) GTFdict[transcript_id][4].append(str(exonWidth)) diff --git a/snakePipes/shared/rules/filterGTF_spikein.snakefile b/snakePipes/shared/rules/filterGTF_spikein.snakefile index 6ef5b5602..63cdf88a7 100644 
--- a/snakePipes/shared/rules/filterGTF_spikein.snakefile +++ b/snakePipes/shared/rules/filterGTF_spikein.snakefile @@ -38,6 +38,7 @@ rule spikein_gtf_to_files: "Annotation_spikein/genes.filtered.bed" run: import shlex + import re t2g = open(output[0], "w") symbol = open(output[1], "w") @@ -47,9 +48,9 @@ rule spikein_gtf_to_files: if line.startswith("#"): continue cols = line.strip().split("\t") + annos = re.split(''';(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', cols[8]) if cols[2] == "gene": # get the gene_name and gene_id values - annos = cols[8].split(";") gene_id = None gene_name = None for anno in annos: @@ -62,9 +63,8 @@ rule spikein_gtf_to_files: gene_name = anno[1] if gene_id: symbol.write("{}\t{}\n".format(gene_id, "" if not gene_name else gene_name)) - elif cols[2] == "transcript": + elif cols[2] == "transcript" or 'RNA' in cols[2]: # get the gene_id and transcript_id values - annos = cols[8].split(";") gene_id = None transcript_id = None gene_name = "" @@ -84,7 +84,6 @@ rule spikein_gtf_to_files: GTFdict[transcript_id] = [cols[0], cols[3], cols[4], cols[6], [], []] elif cols[2] == "exon": # get the transcript_id - annos = cols[8].split(";") transcript_id = None for anno in annos: anno = shlex.split(anno.strip(), " ") @@ -92,7 +91,7 @@ rule spikein_gtf_to_files: continue if anno[0] == "transcript_id": transcript_id = anno[1] - if transcript_id: + if transcript_id and transcript_id in GTFdict: exonWidth = int(cols[4]) - int(cols[3]) + 1 exonOffset = int(cols[3]) - int(GTFdict[transcript_id][1]) GTFdict[transcript_id][4].append(str(exonWidth))