Merge pull request #879 from maxplanck-ie/develop

Develop
maxplanck-ie · Jan 25, 2023 · 27cfcbc · 27cfcbc
2 parents 689f8b2 + ac3dffd
commit 27cfcbc
Show file tree

Hide file tree

Showing 12 changed files with 33 additions and 24 deletions.
diff --git a/docs/content/News.rst b/docs/content/News.rst
@@ -1,6 +1,14 @@
 snakePipes News
 ===============
 
+snakePipes 2.7.2
+----------------
+* STAR version has been updated to 2.7.10b. Version 2.7.10a was causing a segmentation fault on macOS.
+* The STAR command has been updated. Compressed input files are now decompressed through STAR's own command line option (``--readFilesCommand``) instead of shell process substitution.
+* Put a cap on the python version for the deeptools env. The current version of deeptools does not support newer python versions, which causes some tools to fail.
+* Update default condaDir.
+* The filter_gtf function has become a bit more versatile. GTF files that include delimiters (';') in e.g. a description field are now allowed. Gene names are also allowed to have symbols now. Lastly, GTF files that have xRNA instead of transcript as a feature in column 3 can also be parsed.
+
 snakePipes 2.7.1
 ----------------
 * Capped snakemake version to < 7.19.0.
@@ -64,9 +72,9 @@ snakePipes 2.5.1
 * Updated Bowtie2 parameters for the cut and tag data
 * Updated multibamSummary in ChIPSeq pipeline for data with spike-in
 * Uncommented the BamCompare for ChIPseq pipeline with spike-in
-* set a max thread for each rule 
-* External PR #764: always check for >0 when generating heatmap for the differential analysis done by deseq2 
-  
+* set a max thread for each rule
+* External PR #764: always check for >0 when generating heatmap for the differential analysis done by deseq2
+
 snakePipes 2.5.0
 ----------------
 

diff --git a/docs/content/running_snakePipes.rst b/docs/content/running_snakePipes.rst
@@ -35,7 +35,7 @@ All individual jobs of the workflow will be submitted to the Grid engine using t
 
 **To run the workflow locally**, use the parameter ``--local`` for local mode and the parameter ``-j 10`` to specify the maximal number of used CPU threads (here: 10).
 
-**For single-end FASTQ files**, the workflow automatically recognized single suffix (eg. "sample1.fastq" instead of "sample1_R1.fastq") as single-end reads. However, mixing of single and paired-end files in the same folder is not supported currently.
+**For single-end FASTQ files**, note that single-end data still needs a valid suffix (e.g. sample1_R1.fastq.gz). With a proper suffix, single-end mode is detected by default. When executing some workflows with the ``--fromBAM`` flag, it is still necessary to set ``--singleEnd``.
 
 Once the DNA-mapping run has finished successfully, we can run the ChIP-seq analysis in the same directory.
 

diff --git a/snakePipes/shared/defaults.yaml b/snakePipes/shared/defaults.yaml
@@ -7,7 +7,7 @@
 # permitted here.
 ################################################################################
 #
-snakemakeOptions: ' --use-conda --conda-prefix /package/anaconda3/envs/ '
+snakemakeOptions: ' --use-conda --conda-prefix /package/mamba/envs/ '
 organismsDir: 'shared/organisms'
 clusterConfig: 'shared/cluster.yaml'
 tempDir: /data/extended/

diff --git a/snakePipes/shared/organisms/mm10_gencodeM19.yaml b/snakePipes/shared/organisms/mm10_gencodeM19.yaml
@@ -13,6 +13,6 @@ star_index: "/data/repository/organisms/GRCm38_ensembl/STARIndex/2.7.10a/"
 genes_bed: "/data/repository/organisms/GRCm38_ensembl/gencode/m19/genes.bed"
 genes_gtf: "/data/repository/organisms/GRCm38_ensembl/gencode/m19/genes.gtf"
 extended_coding_regions_gtf: "/data/repository/organisms/GRCm38_ensembl/gencode/m19/genes.slop.gtf"
-blacklist_bed: "/data/repository/organisms/GRCm38_ensembl/DKFZ/GRCm38_General_readAttractingRegions.bed"
+blacklist_bed: "/data/repository/organisms/GRCm38_ensembl/DKFZ/GRCm38_General_readAttractingRegions.UseThisOne.bed"
 ignoreForNormalization: "MT X Y JH584299.1 GL456233.1 JH584301.1 GL456211.1 GL456350.1 JH584293.1 GL456221.1 JH584297.1 JH584296.1 GL456354.1 JH584294.1 JH584298.1 JH584300.1 GL456219.1 GL456210.1 JH584303.1 JH584302.1 GL456212.1 JH584304.1 GL456379.1 GL456216.1 GL456393.1 GL456366.1 GL456367.1 GL456239.1 GL456213.1 GL456383.1 GL456385.1 GL456360.1 GL456378.1 GL456389.1 GL456372.1 GL456370.1 GL456381.1 GL456387.1 GL456390.1 GL456394.1 GL456392.1 GL456382.1 GL456359.1 GL456396.1 GL456368.1 JH584292.1 JH584295.1"
 rmsk_file: '/data/repository/organisms/GRCm38_ensembl/UCSC/mm10/rmsk.txt'
diff --git a/snakePipes/shared/rscripts/merge_featureCounts.R b/snakePipes/shared/rscripts/merge_featureCounts.R
@@ -16,7 +16,7 @@ isallelic <- function(x) {
 get_df <- function(infile) {
   cat(infile, "\n")
   bname = gsub(".counts.txt" , "" , basename(infile) )
-  df = read.table(infile, header=T)
+  df = read.table(infile, header=T, sep='\t')
 
   if(isallelic(df) == TRUE) {
   print("Counts are allele-specific")

diff --git a/snakePipes/shared/rules/RNA_mapping.snakefile b/snakePipes/shared/rules/RNA_mapping.snakefile
@@ -123,7 +123,8 @@ elif aligner.upper().find("STAR") >=0:
                     --outStd BAM_Unsorted \
                     --sjdbGTFfile {params.gtf} \
                     --genomeDir {params.index} \
-                    --readFilesIn <(gunzip -c {input.r1}) <(gunzip -c {input.r2}) \
+                    --readFilesIn  {input.r1} {input.r2} \
+                    --readFilesCommand 'gunzip -c' \
                     --outFileNamePrefix {params.prefix} \
                 | samtools sort -m {params.samsort_memory} -T $MYTEMP/{wildcards.sample} -@ {params.samtools_threads} -O bam -o {output.bam} - 2> {log}
                 rm -rf $MYTEMP
@@ -160,7 +161,8 @@ elif aligner.upper().find("STAR") >=0:
                     --outStd BAM_Unsorted \
                     --sjdbGTFfile {params.gtf} \
                     --genomeDir {params.index} \
-                    --readFilesIn <(gunzip -c {input}) \
+                    --readFilesIn {input} \
+                    --readFilesCommand 'gunzip -c' \
                     --outFileNamePrefix {params.prefix} \
                 | samtools sort -m {params.samsort_memory} -T $MYTEMP/{wildcards.sample} -@ {params.samtools_threads} -O bam -o {output.bam} - 2> {log}
                 rm -rf $MYTEMP

diff --git a/snakePipes/shared/rules/envs/createIndices.yaml b/snakePipes/shared/rules/envs/createIndices.yaml
@@ -8,7 +8,7 @@ dependencies:
  - ucsc-genepredtobed
  - bowtie2 = 2.4.5
  - hisat2 = 2.2.1
- - star = 2.7.10a
+ - star = 2.7.10b
  - bwa = 0.7.17
  - bwa-mem2 = 2.2.1
  - bwameth = 0.2.5

diff --git a/snakePipes/shared/rules/envs/rna_seq.yaml b/snakePipes/shared/rules/envs/rna_seq.yaml
@@ -8,7 +8,7 @@ dependencies:
  - samtools = 1.15.1
  - subread = 2.0.1
  - hisat2 = 2.2.1
- - star = 2.7.10a
+ - star = 2.7.10b
  - salmon = 1.9.0
  - r-base = 4.1.3
  - r-wasabi

diff --git a/snakePipes/shared/rules/envs/sc_rna_seq.yaml b/snakePipes/shared/rules/envs/sc_rna_seq.yaml
@@ -5,12 +5,12 @@ channels:
 dependencies:
  - umi_tools = 1.1.2
  - samtools = 1.15.1
- - star = 2.7.10a
+ - star = 2.7.10b
  - r-base = 4.1.3
  - r-stringr
  - r-data.table
- - r-seurat = 4.1.1 
+ - r-seurat = 4.1.1
  - r-gtools
  - r-dplyr
  - h5py = 3.6.0
- - velocyto.py = 0.17.17 
+ - velocyto.py = 0.17.17
diff --git a/snakePipes/shared/rules/envs/shared.yaml b/snakePipes/shared/rules/envs/shared.yaml
@@ -3,6 +3,7 @@ channels:
  - conda-forge
  - bioconda
 dependencies:
+ - python = 3.7.12 # This pin can be changed once deeptools is updated to its next version
  - deeptools = 3.5.1
  - seqtk = 1.3
  - pigz = 2.6

diff --git a/snakePipes/shared/rules/filterGTF.snakefile b/snakePipes/shared/rules/filterGTF.snakefile
@@ -38,6 +38,7 @@ rule gtf_to_files:
         "Annotation/genes.filtered.bed"
     run:
         import shlex
+        import re
 
         t2g = open(output[0], "w")
         symbol = open(output[1], "w")
@@ -47,9 +48,9 @@ rule gtf_to_files:
             if line.startswith("#"):
                 continue
             cols = line.strip().split("\t")
+            annos = re.split(''';(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', cols[8]) 
             if cols[2] == "gene":
                 # get the gene_name and gene_id values
-                annos = cols[8].split(";")
                 gene_id = None
                 gene_name = None
                 for anno in annos:
@@ -62,9 +63,8 @@ rule gtf_to_files:
                         gene_name = anno[1]
                 if gene_id:
                     symbol.write("{}\t{}\n".format(gene_id, "" if not gene_name else gene_name))
-            elif cols[2] == "transcript":
+            elif cols[2] == "transcript" or 'RNA' in cols[2]:
                 # get the gene_id and transcript_id values
-                annos = cols[8].split(";")
                 gene_id = None
                 transcript_id = None
                 gene_name = ""
@@ -84,15 +84,14 @@ rule gtf_to_files:
                     GTFdict[transcript_id] = [cols[0], cols[3], cols[4], cols[6], [], []]
             elif cols[2] == "exon":
                 # get the transcript_id
-                annos = cols[8].split(";")
                 transcript_id = None
                 for anno in annos:
                     anno = shlex.split(anno.strip(), " ")
                     if len(anno) == 0:
                         continue
                     if anno[0] == "transcript_id":
                         transcript_id = anno[1]
-                if transcript_id:
+                if transcript_id and transcript_id in GTFdict:
                     exonWidth = int(cols[4]) - int(cols[3]) + 1
                     exonOffset = int(cols[3]) - int(GTFdict[transcript_id][1])
                     GTFdict[transcript_id][4].append(str(exonWidth))

diff --git a/snakePipes/shared/rules/filterGTF_spikein.snakefile b/snakePipes/shared/rules/filterGTF_spikein.snakefile
@@ -38,6 +38,7 @@ rule spikein_gtf_to_files:
         "Annotation_spikein/genes.filtered.bed"
     run:
         import shlex
+        import re
 
         t2g = open(output[0], "w")
         symbol = open(output[1], "w")
@@ -47,9 +48,9 @@ rule spikein_gtf_to_files:
             if line.startswith("#"):
                 continue
             cols = line.strip().split("\t")
+            annos = re.split(''';(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', cols[8]) 
             if cols[2] == "gene":
                 # get the gene_name and gene_id values
-                annos = cols[8].split(";")
                 gene_id = None
                 gene_name = None
                 for anno in annos:
@@ -62,9 +63,8 @@ rule spikein_gtf_to_files:
                         gene_name = anno[1]
                 if gene_id:
                     symbol.write("{}\t{}\n".format(gene_id, "" if not gene_name else gene_name))
-            elif cols[2] == "transcript":
+            elif cols[2] == "transcript" or 'RNA' in cols[2]:
                 # get the gene_id and transcript_id values
-                annos = cols[8].split(";")
                 gene_id = None
                 transcript_id = None
                 gene_name = ""
@@ -84,15 +84,14 @@ rule spikein_gtf_to_files:
                     GTFdict[transcript_id] = [cols[0], cols[3], cols[4], cols[6], [], []]
             elif cols[2] == "exon":
                 # get the transcript_id
-                annos = cols[8].split(";")
                 transcript_id = None
                 for anno in annos:
                     anno = shlex.split(anno.strip(), " ")
                     if len(anno) == 0:
                         continue
                     if anno[0] == "transcript_id":
                         transcript_id = anno[1]
-                if transcript_id:
+                if transcript_id and transcript_id in GTFdict:
                     exonWidth = int(cols[4]) - int(cols[3]) + 1
                     exonOffset = int(cols[3]) - int(GTFdict[transcript_id][1])
                     GTFdict[transcript_id][4].append(str(exonWidth))