Skip to content

Commit

Permalink
Add mix and match functionality for mutation calling (resolves #265)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidstew committed May 13, 2018
1 parent 101565a commit f1e1f69
Show file tree
Hide file tree
Showing 11 changed files with 79 additions and 21 deletions.
3 changes: 2 additions & 1 deletion src/protect/mutation_annotation/snpeff.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,15 @@ def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options):
:return: fsID for the snpeffed vcf
:rtype: toil.fileStore.FileID
"""
if merged_mutation_file is None:
return None
work_dir = os.getcwd()
input_files = {
'merged_mutations.vcf': merged_mutation_file,
'snpeff_index.tar.gz': snpeff_options['index']}
input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
input_files['snpeff_index'] = untargz(input_files['snpeff_index.tar.gz'], work_dir)
input_files = {key: docker_path(path) for key, path in input_files.items()}

parameters = ['eff',
'-dataDir', input_files['snpeff_index'],
'-c', '/'.join([input_files['snpeff_index'],
Expand Down
44 changes: 31 additions & 13 deletions src/protect/mutation_calling/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,28 @@ def run_mutation_aggregator(job, mutation_results, univ_options):
"""
# Setup an input data structure for the merge function
out = {}
for chrom in mutation_results['mutect'].keys():
out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results,
univ_options).rv()
merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options)
job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient'])
return merged_snvs.rv()
chroms = {}
for caller in mutation_results:
if mutation_results[caller] is None:
continue
else:
if caller == 'strelka':
if mutation_results['strelka']['snvs'] is None:
continue
chroms = mutation_results['strelka']['snvs'].keys()
else:
chroms = mutation_results[caller].keys()
break
if chroms:
for chrom in chroms:
out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results,
univ_options).rv()
merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options)
job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient'])
return merged_snvs.rv()
else:
return None



def merge_perchrom_mutations(job, chrom, mutations, univ_options):
Expand Down Expand Up @@ -105,25 +121,27 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options):
# For now, let's just say 2 out of n need to call it.
# num_preds = len(mutations)
# majority = int((num_preds + 0.5) / 2)
majority = {'snvs': 2,
'indels': 1}

accepted_hits = defaultdict(dict)

for mut_type in vcf_processor.keys():
# Get input files
perchrom_mutations = {caller: vcf_processor[mut_type][caller](job, mutations[caller][chrom],
work_dir, univ_options)
for caller in vcf_processor[mut_type]}
for caller in vcf_processor[mut_type]
if mutations[caller] is not None}
# Process the strelka key
perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type]
perchrom_mutations.pop('strelka_' + mut_type)
if 'strelka' + mut_type in perchrom_mutations:
perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type]
perchrom_mutations.pop('strelka_' + mut_type)
if not perchrom_mutations:
continue
majority = 1 if len(perchrom_mutations) <= 2 else len(perchrom_mutations) / 2
# Read in each file to a dict
vcf_lists = {caller: read_vcf(vcf_file) for caller, vcf_file in perchrom_mutations.items()}
all_positions = list(set(itertools.chain(*vcf_lists.values())))
for position in sorted(all_positions):
hits = {caller: position in vcf_lists[caller] for caller in perchrom_mutations.keys()}
if sum(hits.values()) >= majority[mut_type]:
if sum(hits.values()) >= majority:
callers = ','.join([caller for caller, hit in hits.items() if hit])
assert position[1] not in accepted_hits[position[0]]
accepted_hits[position[0]][position[1]] = (position[2], position[3], callers)
Expand Down
4 changes: 3 additions & 1 deletion src/protect/mutation_calling/indel.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ def run_indel_caller(job, tumor_bam, normal_bam, univ_options, indel_options):
:return: fsID to the merged fusion calls
:rtype: toil.fileStore.FileID
"""
job.fileStore.logToMaster('INDELs are currently unsupported.... Skipping.')
job.fileStore.logToMaster('INDELs currently occur only through strelka.')
if indel_options['run'] is False:
return None
indel_file = job.fileStore.getLocalTempFile()
output_file = job.fileStore.writeGlobalFile(indel_file)
job.fileStore.logToMaster('Ran INDEL on %s successfully' % univ_options['patient'])
Expand Down
3 changes: 3 additions & 0 deletions src/protect/mutation_calling/muse.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ def run_muse(job, tumor_bam, normal_bam, univ_options, muse_options):
+- 'chrM': fsID
:rtype: dict
"""

if muse_options['run'] is False:
return None
# Get a list of chromosomes to handle
if muse_options['chromosomes']:
chromosomes = muse_options['chromosomes']
Expand Down
3 changes: 3 additions & 0 deletions src/protect/mutation_calling/mutect.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ def run_mutect(job, tumor_bam, normal_bam, univ_options, mutect_options):
+- 'chrM': fsID
:rtype: dict
"""

if mutect_options['run'] is False:
return None
# Get a list of chromosomes to handle
if mutect_options['chromosomes']:
chromosomes = mutect_options['chromosomes']
Expand Down
2 changes: 2 additions & 0 deletions src/protect/mutation_calling/radia.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ def run_radia(job, rna_bam, tumor_bam, normal_bam, univ_options, radia_options):
+- 'chrM': fsID
:rtype: dict
"""
if radia_options['run'] is False:
return None
if 'rna_genome' in rna_bam.keys():
rna_bam = rna_bam['rna_genome']
elif set(rna_bam.keys()) == {'rna_genome_sorted.bam', 'rna_genome_sorted.bam.bai'}:
Expand Down
3 changes: 3 additions & 0 deletions src/protect/mutation_calling/somaticsniper.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ def run_somaticsniper(job, tumor_bam, normal_bam, univ_options, somaticsniper_op
+- 'chrM': fsID
:rtype: toil.fileStore.FileID|dict
"""

if somaticsniper_options['run'] is False:
return None
# Get a list of chromosomes to handle
if somaticsniper_options['chromosomes']:
chromosomes = somaticsniper_options['chromosomes']
Expand Down
20 changes: 16 additions & 4 deletions src/protect/mutation_calling/strelka.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ def run_strelka_with_merge(job, tumor_bam, normal_bam, univ_options, strelka_opt
:param dict univ_options: Dict of universal options used by almost all tools
:param dict strelka_options: Options specific to strelka
:return: fsID to the merged strelka calls
:rtype: toil.fileStore.FileID
:rtype: dict(str, toil.fileStore.FileID)
|-'snvs': fsID
|
+-'indels': fsID
"""
spawn = job.wrapJobFn(run_strelka, tumor_bam, normal_bam, univ_options,
strelka_options, split=False).encapsulate()
Expand All @@ -63,8 +66,9 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split
:param dict univ_options: Dict of universal options used by almost all tools
:param dict strelka_options: Options specific to strelka
:param bool split: Should the results be split into perchrom vcfs?
:return: Either the fsID to the genome-level vcf or a dict of results from running strelka
on every chromosome
:return: Either the a dict of results from running strelka on every chromosome, or a dict of running strelka on the
whole genome
perchrom_strelka:
|- 'chr1':
| |-'snvs': fsID
Expand All @@ -77,8 +81,16 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split
+- 'chrM':
|-'snvs': fsID
+-'indels': fsID
:rtype: toil.fileStore.FileID|dict
genome_strelka:
|-'snvs': fsID
|
+-'indels': fsID
:rtype: dict
"""
if strelka_options['run'] is False:
return {'snvs': None, 'indels': None}
if strelka_options['chromosomes']:
chromosomes = strelka_options['chromosomes']
else:
Expand Down
8 changes: 6 additions & 2 deletions src/protect/pipeline/ProTECT.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def _parse_config_file(job, config_file, max_cores=None):
input_config = _add_default_entries(input_config, protect_defaults)

# Flags to check for presence of encryption keys if required
gdc_inputs = ssec_encrypted = False
gdc_inputs = ssec_encrypted = sample_without_variants = False
for key in input_config.keys():
if key == 'patients':
# Ensure each patient contains the required entries
Expand All @@ -373,6 +373,8 @@ def _parse_config_file(job, config_file, max_cores=None):
raise ParameterError('Cannot run ProTECT using GDC RNA bams. Please fix '
'sample %s' % sample_name)
gdc_inputs = True
if 'mutation_vcf' not in sample_set[sample_name] and 'fusion_bedpe' not in sample_set[sample_name]:
sample_without_variants = True
else:
# Ensure the required entries exist for this key
_ensure_set_contains(input_config[key], required_keys[key], key)
Expand Down Expand Up @@ -427,6 +429,8 @@ def _parse_config_file(job, config_file, max_cores=None):
'token.')
# Get all the tool inputs
job.fileStore.logToMaster('Obtaining tool inputs')
if sample_without_variants and all(tool_options[k]['run'] is False for k in mutation_caller_list if k not in ('indexes', 'fusion_inspector')):
raise RuntimeError("Cannot run mutation callers if all callers are set to run = False.")
process_tool_inputs = job.addChildJobFn(get_all_tool_inputs, tool_options,
mutation_caller_list=mutation_caller_list)
job.fileStore.logToMaster('Obtained tool inputs')
Expand Down Expand Up @@ -670,7 +674,7 @@ def launch_protect(job, patient_data, univ_options, tool_options):
bam_files['normal_dna'].rv(), univ_options,
tool_options['strelka']).encapsulate(),
'indels': job.wrapJobFn(run_indel_caller, bam_files['tumor_dna'].rv(),
bam_files['normal_dna'].rv(), univ_options, 'indel_options',
bam_files['normal_dna'].rv(), univ_options, {'run': False},
disk='100M', memory='100M', cores=1)}
for sample_type in 'tumor_dna', 'normal_dna':
for caller in mutations:
Expand Down
5 changes: 5 additions & 0 deletions src/protect/pipeline/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,17 +60,22 @@ mutation_calling:
mutect:
java_Xmx: 2G
version: 1.1.7
run: True
muse:
version: 1.0rc_submission_b391201
run: True
radia:
version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03
run: True
somaticsniper:
run: True
version: 1.0.4
samtools:
version: 0.1.8
bam_readcount:
version: 0.7.4
strelka:
run: True
version: 1.0.15

star_fusion:
Expand Down
5 changes: 5 additions & 0 deletions src/protect/pipeline/input_parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,21 @@ mutation_calling:
dbsnp_tbi: S3://protect-data/hg38_references/dbsnp_coding.vcf.gz.tbi
mutect:
java_Xmx: 2G
# run: True
# version: 1.1.7
muse:
# run: True
# version: 1.0rc_submission_b391201
radia:
cosmic_beds: S3://protect-data/hg38_references/radia_cosmic.tar.gz
dbsnp_beds: S3://protect-data/hg38_references/radia_dbsnp.tar.gz
retrogene_beds: S3://protect-data/hg38_references/radia_retrogenes.tar.gz
pseudogene_beds: S3://protect-data/hg38_references/radia_pseudogenes.tar.gz
gencode_beds: S3://protect-data/hg38_references/radia_gencode.tar.gz
# run: True
# version: 398366ef07b5911d8082ed61cbf03d487a41f286
somaticsniper:
# run: True
# version: 1.0.4
samtools:
# version: 0.1.8
Expand All @@ -118,6 +122,7 @@ mutation_calling:
#run_trinity: True
#version: 1.0.1
strelka:
# run: True
# version: 1.0.15
config_file: S3://protect-data/hg38_references/strelka_bwa_WXS_config.ini.tar.gz

Expand Down

0 comments on commit f1e1f69

Please sign in to comment.