Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1. Added the samplesheet to the pipeline output as `OUTDIR/samplesheet.csv`
2. Added the `--bedpe` parameter. This makes the pipeline output BEDPE files alongside the VCF files.
3. Added parallelization on SV type to the delly flow
4. Added a `--gtf` parameter for annotation of gene and transcript overlap using `gatk SVAnnotate`.

### `Changes`

Expand Down
4 changes: 3 additions & 1 deletion assets/svync/delly.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
id: delly_$INFO/SVTYPE
alt:
BND: TRA
alts:
BND: <TRA>
value: <$INFO/SVTYPE>
info:
CALLERS:
value: delly
Expand Down
2 changes: 2 additions & 0 deletions assets/svync/manta.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
id: manta_$INFO/SVTYPE
alt:
value: <$INFO/SVTYPE>
info:
CALLERS:
value: manta
Expand Down
76 changes: 76 additions & 0 deletions bin/preprocess_gtf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env python
# https://github.com/broadinstitute/gatk-sv/blob/main/scripts/inputs/preprocess_gtf.py

"""
Preprocess GENCODE basic GTF to extract canonical protein-coding transcripts for functional consequence annotation.
"""

import argparse
import gzip


# Zero-based column indexes into a tab-separated GTF record.
CHROM_FIELD = 0  # sequence name; compared against 'chrM' in process()
ELEMENT_FIELD = 2  # feature type; compared against "gene" in process()
ATTRIBUTES_FIELD = 8  # attributes column handed to parse_attributes()
# `transcript_type` attribute values that parse_attributes() accepts.
TRANSCRIPT_TYPES = {"protein_coding", "nonsense_mediated_decay"}
# `tag` attribute values that parse_attributes() treats as canonical.
CANONICAL = {"MANE_Plus_Clinical", "MANE_Select", "Ensembl_canonical"}


# Flexibly open .gz or uncompressed file to read
def _open(filename):
if filename.endswith(".gz"):
return gzip.open(filename, 'rt')
else:
return open(filename, 'r')


# Extract transcript type and canonical status
def parse_attributes(field):
    """Extract transcript-type and canonical status from a GTF attributes column.

    Args:
        field: Raw attributes column, formatted as
            ``key1 "value1"; key2 "value2";``. Keys may be repeated (several
            ``tag`` entries, for instance), so the column is scanned pair by
            pair instead of being converted to a dictionary.

    Returns:
        A ``(protein, canonical)`` tuple of booleans. ``protein`` is True when
        a ``transcript_type`` value is in ``TRANSCRIPT_TYPES``; ``canonical``
        is True when a ``tag`` value is in ``CANONICAL``.
    """
    protein = False
    canonical = False
    # Strip surrounding whitespace before the trailing ';' so a column ending
    # in "; " does not leave an empty final attribute.
    for attribute in field.strip().rstrip(";").split("; "):
        # Split on the first space only: values may themselves contain spaces
        # and an attribute may have no value at all, either of which would
        # break a fixed two-element unpacking.
        key, _, val = attribute.strip().partition(" ")
        val = val.replace('"', "").strip()
        if key == "tag" and val in CANONICAL:
            canonical = True
        elif key == "transcript_type" and val in TRANSCRIPT_TYPES:
            protein = True
    return protein, canonical


def process(gtf, outfile):
    """Write the protein-coding, canonical records of a GTF to ``outfile``.

    Records whose chromosome is ``chrM`` are dropped. A ``gene`` record is
    never written on its own: it is held back and emitted once, immediately
    before the first following record that parse_attributes() reports as both
    protein-coding and canonical.

    Args:
        gtf: Path to the input GTF (plain or ``.gz``).
        outfile: Path of the filtered GTF to create (overwritten if present).
    """
    with _open(gtf) as inp, open(outfile, "w", encoding="utf-8") as out:
        gene_line = ""
        for line in inp:
            # Header/comment lines and blank lines (e.g. a trailing empty
            # line) carry no feature columns; indexing them would fail.
            if line.startswith("#") or not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")

            # Drop mitochondria
            if fields[CHROM_FIELD] == "chrM":
                continue

            # Store gene line to print if transcript is eligible
            if fields[ELEMENT_FIELD] == "gene":
                gene_line = line
                continue

            # Select protein-coding and canonical transcripts only
            protein, canonical = parse_attributes(fields[ATTRIBUTES_FIELD])
            if protein and canonical:
                out.write(gene_line + line)
                gene_line = ""  # only print gene line before first transcript line


def main():
    """Parse the two positional command-line arguments and run process()."""
    cli = argparse.ArgumentParser()
    positionals = (
        ("gtf", "Input GTF from GENCODE"),
        ("outfile", "Output filename"),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    parsed = cli.parse_args()

    process(parsed.gtf, parsed.outfile)


# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,10 @@ process {
ext.args = "-ends"
}

withName: "^.*GATK4_SVANNOTATE\$" {
ext.prefix = {"${meta.id}.${meta.variant_type}.svannotate"}
}

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SV AND CNV FILTERING
Expand Down
2 changes: 2 additions & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ params {
// Fasta references
fasta = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.fasta"
fai = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.fasta.fai"
dict = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.dict"
gtf = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.gtf"
// bwa = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/bwa.tar.gz"
expansionhunter_catalog = params.test_data["homo_sapiens"]["genome"]["expansionhunter"]
qdnaseq_male = params.test_data["homo_sapiens"]["genome"]["genome_qdnaseq"]
Expand Down
4 changes: 4 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_stru

params.fasta = getGenomeAttribute('fasta')
params.fai = getGenomeAttribute('fai')
params.dict = getGenomeAttribute('dict')
params.gtf = getGenomeAttribute('gtf')
params.vep_cache = getGenomeAttribute('vep_cache')
// params.bwa = getGenomeAttribute('bwa')
params.annotsv_annotations = getGenomeAttribute('annotsv_annotations')
Expand Down Expand Up @@ -81,6 +83,8 @@ workflow {
// files
params.fasta,
params.fai,
params.dict,
params.gtf,
params.expansionhunter_catalog ?: "https://github.com/Illumina/ExpansionHunter/raw/master/variant_catalog/grch38/variant_catalog.json",
params.qdnaseq_female,
params.qdnaseq_male,
Expand Down
16 changes: 13 additions & 3 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
"nf-core": {
"annotsv/annotsv": {
"branch": "master",
"git_sha": "a94ad45fa5b350961c374c46f79bc86cd5853353",
"git_sha": "296d216c3f6384936a6526b6fbed7e6412259fb4",
"installed_by": ["modules"]
},
"annotsv/installannotations": {
"branch": "master",
"git_sha": "a94ad45fa5b350961c374c46f79bc86cd5853353",
"git_sha": "296d216c3f6384936a6526b6fbed7e6412259fb4",
"installed_by": ["modules"]
},
"bcftools/annotate": {
Expand Down Expand Up @@ -66,6 +66,16 @@
"git_sha": "b42fec6f7c6e5d0716685cabb825ef6bf6e386b5",
"installed_by": ["modules"]
},
"gatk4/createsequencedictionary": {
"branch": "master",
"git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46",
"installed_by": ["modules"]
},
"gatk4/svannotate": {
"branch": "master",
"git_sha": "cc7e281e7877146dac79c5a484e6e2b10086234a",
"installed_by": ["modules"]
},
"gawk": {
"branch": "master",
"git_sha": "b42fec6f7c6e5d0716685cabb825ef6bf6e386b5",
Expand Down Expand Up @@ -135,7 +145,7 @@
},
"svync": {
"branch": "master",
"git_sha": "916a4cbc4f831d501860495b157c4857833e22a7",
"git_sha": "0fc190096fa8dcc9878cef178479f22e03f174a1",
"installed_by": ["modules"]
},
"tabix/bgziptabix": {
Expand Down
5 changes: 5 additions & 0 deletions modules/local/preprocess_gtf/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
channels:
- conda-forge
- bioconda
dependencies:
- conda-forge::python=3.13.5
40 changes: 40 additions & 0 deletions modules/local/preprocess_gtf/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Filters a GTF with bin/preprocess_gtf.py and emits the result as
// `<prefix>.sanitized.gtf`, where prefix defaults to the input's base name.
process PREPROCESS_GTF {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/8a/8ad257d53c2a2b8810d2b12d4d8e3ea438bc8c4a6be7c39b0354cd7bb8d5c260/data':
        'community.wave.seqera.io/library/python:3.13.5--18032a8dc5d4b91e' }"

    input:
    tuple val(meta), path(gtf)

    output:
    tuple val(meta), path("*.sanitized.gtf"), emit: gtf
    path "versions.yml"                     , emit: versions

    script:
    def prefix = task.ext.prefix ?: "${gtf.baseName}"

    // The only tool this process runs is the Python script, so the Python
    // interpreter version is what gets recorded in versions.yml.
    """
    preprocess_gtf.py $gtf ${prefix}.sanitized.gtf

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
    END_VERSIONS
    """

    stub:
    def prefix = task.ext.prefix ?: "${gtf.baseName}"

    // Stub run: create an empty placeholder output with the same name and
    // report the same tool version as the real script block.
    """
    touch ${prefix}.sanitized.gtf

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
    END_VERSIONS
    """
}
45 changes: 0 additions & 45 deletions modules/nf-core/annotsv/annotsv/annotsv-annotsv.diff

This file was deleted.

6 changes: 3 additions & 3 deletions modules/nf-core/annotsv/annotsv/environment.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 13 additions & 12 deletions modules/nf-core/annotsv/annotsv/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading