Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mix and match functionality for mutation calling (resolves #265) #283

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions MANUAL.md
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,11 @@ be substituted with S3 links. Descriptions for creating all files can be found i
version: 1.2.0

mutation_calling:
consensus:
indel_majority: -> Number of callers required to accept an indel.
Will dynamically compute the indel majority by default.
snv_majority: -> Number of callers required to accept an snv.
Will dynamically compute the snv majority by default.
indexes:
chromsosomes: canonical, canonical_chr, chr1, Y -> A list of chromosomes to process.
This options overrides the
Expand Down Expand Up @@ -362,8 +367,10 @@ be substituted with S3 links. Descriptions for creating all files can be found i
java_Xmx: 5G -> The heap size to use for MuTect
per job (i.e. per chromosome)
version: 1.1.7
run: True -> Switch to skip calling mutect
muse:
version: 1.0rc_submission_b391201
run: True -> Switch to skip calling muse
radia: -> Radia uses perchrom bed files in
folders as references.
cosmic_beds: /path/to/radia_cosmic.tar.gz
Expand All @@ -372,17 +379,20 @@ be substituted with S3 links. Descriptions for creating all files can be found i
pseudogene_beds: /path/to/radia_pseudogenes.tar.gz
gencode_beds: /path/to/radia_gencode.tar.gz
version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03
run: True -> Switch to skip calling radia
somaticsniper:
version: 1.0.4
samtools: -> pileup reads
version: 0.1.8
bam_readcount: -> obtain readcounts
version: 0.7.4
run: True -> Switch to skip calling somaticsniper
strelka:
version: 1.0.15
config_file: /path/to/strelka_config.ini.tar.gz -> The Strelka config file for a
bwa run (modified for a WXS run
if necessary)
run: True -> Switch to skip calling strelka
star_fusion:
run: True -> Switch to skip fusion calling
version: 1.0.0
Expand Down
2 changes: 1 addition & 1 deletion src/protect/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,7 @@ def email_report(job, univ_options):
server = smtplib.SMTP('localhost')
except socket.error as e:
if e.errno == 111:
print('No mail utils on this maachine')
print('No mail utils on this machine')
else:
print('Unexpected error while attempting to send an email.')
print('Could not send email report')
Expand Down
3 changes: 2 additions & 1 deletion src/protect/mutation_annotation/snpeff.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,15 @@ def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options):
:return: fsID for the snpeffed vcf
:rtype: toil.fileStore.FileID
"""
if merged_mutation_file is None:
return None
work_dir = os.getcwd()
input_files = {
'merged_mutations.vcf': merged_mutation_file,
'snpeff_index.tar.gz': snpeff_options['index']}
input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
input_files['snpeff_index'] = untargz(input_files['snpeff_index.tar.gz'], work_dir)
input_files = {key: docker_path(path) for key, path in input_files.items()}

parameters = ['eff',
'-dataDir', input_files['snpeff_index'],
'-c', '/'.join([input_files['snpeff_index'],
Expand Down
69 changes: 45 additions & 24 deletions src/protect/mutation_calling/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,34 +51,54 @@ def chromosomes_from_fai(genome_fai):
return chromosomes


def run_mutation_aggregator(job, mutation_results, univ_options):
def run_mutation_aggregator(job, mutation_results, univ_options, consensus_options):
"""
Aggregate all the called mutations.

:param dict mutation_results: Dict of dicts of the various mutation callers in a per chromosome
format
:param dict univ_options: Dict of universal options used by almost all tools
:param dict consensus_options: options specific for consensus mutation calling
:returns: fsID for the merged mutations file
:rtype: toil.fileStore.FileID
"""
# Setup an input data structure for the merge function
out = {}
for chrom in mutation_results['mutect'].keys():
out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results,
univ_options).rv()
merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options)
job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient'])
return merged_snvs.rv()
chroms = {}
# Extract the chromosomes from a mutation caller if at least one mutation caller is selected. All callers should
# have the same chromosomes.
for caller in mutation_results:
if mutation_results[caller] is None:
continue
else:
if caller == 'strelka':
if mutation_results['strelka']['snvs'] is None:
continue
chroms = mutation_results['strelka']['snvs'].keys()
else:
chroms = mutation_results[caller].keys()
break
if chroms:
for chrom in chroms:
out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results,
univ_options, consensus_options).rv()
merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options)
job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient'])
return merged_snvs.rv()
else:
return None



def merge_perchrom_mutations(job, chrom, mutations, univ_options):
def merge_perchrom_mutations(job, chrom, mutations, univ_options, consensus_options):
"""
Merge the mutation calls for a single chromosome.

:param str chrom: Chromosome to process
:param dict mutations: dict of dicts of the various mutation caller names as keys, and a dict of
per chromosome job store ids for vcfs as value
:param dict univ_options: Dict of universal options used by almost all tools
:param dict consensus_options: options specific for consensus mutation calling
:returns fsID for vcf contaning merged calls for the given chromosome
:rtype: toil.fileStore.FileID
"""
Expand All @@ -91,39 +111,40 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options):
mutations.pop('indels')
mutations['strelka_indels'] = mutations['strelka']['indels']
mutations['strelka_snvs'] = mutations['strelka']['snvs']
vcf_processor = {'snvs': {'mutect': process_mutect_vcf,
vcf_processor = {'snv': {'mutect': process_mutect_vcf,
'muse': process_muse_vcf,
'radia': process_radia_vcf,
'somaticsniper': process_somaticsniper_vcf,
'strelka_snvs': process_strelka_vcf
},
'indels': {'strelka_indels': process_strelka_vcf
'indel': {'strelka_indels': process_strelka_vcf
}
}
# 'fusions': lambda x: None,
# 'indels': lambda x: None}
# For now, let's just say 2 out of n need to call it.
# num_preds = len(mutations)
# majority = int((num_preds + 0.5) / 2)
majority = {'snvs': 2,
'indels': 1}

accepted_hits = defaultdict(dict)

for mut_type in vcf_processor.keys():
# Get input files
perchrom_mutations = {caller: vcf_processor[mut_type][caller](job, mutations[caller][chrom],
work_dir, univ_options)
for caller in vcf_processor[mut_type]}
for caller in vcf_processor[mut_type]
if mutations[caller] is not None}
if not perchrom_mutations:
continue
# Process the strelka key
perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type]
perchrom_mutations.pop('strelka_' + mut_type)
if 'strelka_' + mut_type in perchrom_mutations:
perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type]
perchrom_mutations.pop('strelka_' + mut_type)
if consensus_options[mut_type + '_majority'] is not None:
majority = consensus_options[mut_type + '_majority']
elif len(perchrom_mutations) <= 2:
majority = 1
else:
majority = (len(perchrom_mutations) + 1) / 2
# Read in each file to a dict
vcf_lists = {caller: read_vcf(vcf_file) for caller, vcf_file in perchrom_mutations.items()}
all_positions = list(set(itertools.chain(*vcf_lists.values())))
for position in sorted(all_positions):
hits = {caller: position in vcf_lists[caller] for caller in perchrom_mutations.keys()}
if sum(hits.values()) >= majority[mut_type]:
if sum(hits.values()) >= majority:
callers = ','.join([caller for caller, hit in hits.items() if hit])
assert position[1] not in accepted_hits[position[0]]
accepted_hits[position[0]][position[1]] = (position[2], position[3], callers)
Expand Down Expand Up @@ -157,7 +178,7 @@ def read_vcf(vcf_file):
if line.startswith('#'):
continue
line = line.strip().split()
vcf_dict.append((line[0], line[1], line[3], line[4]))
vcf_dict.append((line[0], line[1], line[3].upper(), line[4].upper()))
return vcf_dict


Expand Down
4 changes: 3 additions & 1 deletion src/protect/mutation_calling/indel.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ def run_indel_caller(job, tumor_bam, normal_bam, univ_options, indel_options):
:return: fsID to the merged fusion calls
:rtype: toil.fileStore.FileID
"""
job.fileStore.logToMaster('INDELs are currently unsupported.... Skipping.')
job.fileStore.logToMaster('INDELs currently occur only through strelka.')
if indel_options['run'] is False:
return None
indel_file = job.fileStore.getLocalTempFile()
output_file = job.fileStore.writeGlobalFile(indel_file)
job.fileStore.logToMaster('Ran INDEL on %s successfully' % univ_options['patient'])
Expand Down
2 changes: 2 additions & 0 deletions src/protect/mutation_calling/muse.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ def run_muse(job, tumor_bam, normal_bam, univ_options, muse_options):
+- 'chrM': fsID
:rtype: dict
"""
if muse_options['run'] is False:
return None
# Get a list of chromosomes to handle
if muse_options['chromosomes']:
chromosomes = muse_options['chromosomes']
Expand Down
2 changes: 2 additions & 0 deletions src/protect/mutation_calling/mutect.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ def run_mutect(job, tumor_bam, normal_bam, univ_options, mutect_options):
+- 'chrM': fsID
:rtype: dict
"""
if mutect_options['run'] is False:
return None
# Get a list of chromosomes to handle
if mutect_options['chromosomes']:
chromosomes = mutect_options['chromosomes']
Expand Down
2 changes: 2 additions & 0 deletions src/protect/mutation_calling/radia.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ def run_radia(job, rna_bam, tumor_bam, normal_bam, univ_options, radia_options):
+- 'chrM': fsID
:rtype: dict
"""
if radia_options['run'] is False:
return None
if 'rna_genome' in rna_bam.keys():
rna_bam = rna_bam['rna_genome']
elif set(rna_bam.keys()) == {'rna_genome_sorted.bam', 'rna_genome_sorted.bam.bai'}:
Expand Down
2 changes: 2 additions & 0 deletions src/protect/mutation_calling/somaticsniper.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ def run_somaticsniper(job, tumor_bam, normal_bam, univ_options, somaticsniper_op
+- 'chrM': fsID
:rtype: toil.fileStore.FileID|dict
"""
if somaticsniper_options['run'] is False:
return None
# Get a list of chromosomes to handle
if somaticsniper_options['chromosomes']:
chromosomes = somaticsniper_options['chromosomes']
Expand Down
20 changes: 16 additions & 4 deletions src/protect/mutation_calling/strelka.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ def run_strelka_with_merge(job, tumor_bam, normal_bam, univ_options, strelka_opt
:param dict univ_options: Dict of universal options used by almost all tools
:param dict strelka_options: Options specific to strelka
:return: fsID to the merged strelka calls
:rtype: toil.fileStore.FileID
:rtype: dict(str, toil.fileStore.FileID)
|-'snvs': fsID
|
+-'indels': fsID
"""
spawn = job.wrapJobFn(run_strelka, tumor_bam, normal_bam, univ_options,
strelka_options, split=False).encapsulate()
Expand All @@ -63,8 +66,9 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split
:param dict univ_options: Dict of universal options used by almost all tools
:param dict strelka_options: Options specific to strelka
:param bool split: Should the results be split into perchrom vcfs?
:return: Either the fsID to the genome-level vcf or a dict of results from running strelka
on every chromosome
:return: Either the a dict of results from running strelka on every chromosome, or a dict of running strelka on the
whole genome

perchrom_strelka:
|- 'chr1':
| |-'snvs': fsID
Expand All @@ -77,8 +81,16 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split
+- 'chrM':
|-'snvs': fsID
+-'indels': fsID
:rtype: toil.fileStore.FileID|dict

genome_strelka:
|-'snvs': fsID
|
+-'indels': fsID

:rtype: dict
"""
if strelka_options['run'] is False:
return {'snvs': None, 'indels': None}
if strelka_options['chromosomes']:
chromosomes = strelka_options['chromosomes']
else:
Expand Down
32 changes: 29 additions & 3 deletions src/protect/pipeline/ProTECT.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,8 @@ def _parse_config_file(job, config_file, max_cores=None):

# Flags to check for presence of encryption keys if required
gdc_inputs = ssec_encrypted = False
# Flag to check if a sample without an input vcf/bedpe was provided
sample_without_variants = False
for key in input_config.keys():
if key == 'patients':
# Ensure each patient contains the required entries
Expand All @@ -373,6 +375,9 @@ def _parse_config_file(job, config_file, max_cores=None):
raise ParameterError('Cannot run ProTECT using GDC RNA bams. Please fix '
'sample %s' % sample_name)
gdc_inputs = True
if ('mutation_vcf' not in sample_set[sample_name]
and 'fusion_bedpe' not in sample_set[sample_name]):
sample_without_variants = True
else:
# Ensure the required entries exist for this key
_ensure_set_contains(input_config[key], required_keys[key], key)
Expand Down Expand Up @@ -427,6 +432,27 @@ def _parse_config_file(job, config_file, max_cores=None):
'token.')
# Get all the tool inputs
job.fileStore.logToMaster('Obtaining tool inputs')
if (sample_without_variants and all(tool_options[k]['run'] is False
for k in mutation_caller_list
if k not in ('indexes', 'fusion_inspector', 'consensus'))):
raise RuntimeError("Cannot run mutation callers if all callers are set to run = False.")

for mut_type in ('snv', 'indel'):
if tool_options['consensus'][mut_type + '_majority'] is not None:
if not isinstance(tool_options['consensus'][mut_type + '_majority'], int):
raise RuntimeError('Majorities have to be integers. Got %s for %s_majority.' %
(tool_options['consensus'][mut_type + '_majority'], mut_type))
if mut_type == 'snv':
count = sum(tool_options[k]['run'] is True
for k in ('muse', 'mutect', 'somaticsniper', 'radia', 'strelka'))
else:
count = 1 if tool_options['strelka']['run'] is True else 0
if tool_options['consensus'][mut_type + '_majority'] > count:
raise RuntimeError('Majority cannot be greater than the number of callers. '
'Got number of %s callers = %s majority = %s.' %
(mut_type, count,
tool_options['consensus'][mut_type + '_majority']))

process_tool_inputs = job.addChildJobFn(get_all_tool_inputs, tool_options,
mutation_caller_list=mutation_caller_list)
job.fileStore.logToMaster('Obtained tool inputs')
Expand Down Expand Up @@ -670,15 +696,15 @@ def launch_protect(job, patient_data, univ_options, tool_options):
bam_files['normal_dna'].rv(), univ_options,
tool_options['strelka']).encapsulate(),
'indels': job.wrapJobFn(run_indel_caller, bam_files['tumor_dna'].rv(),
bam_files['normal_dna'].rv(), univ_options, 'indel_options',
bam_files['normal_dna'].rv(), univ_options, {'run': False},
disk='100M', memory='100M', cores=1)}
for sample_type in 'tumor_dna', 'normal_dna':
for caller in mutations:
bam_files[sample_type].addChild(mutations[caller])
bam_files['tumor_rna'].addChild(mutations['radia'])
get_mutations = job.wrapJobFn(run_mutation_aggregator,
{caller: cjob.rv() for caller, cjob in mutations.items()},
univ_options, disk='100M', memory='100M',
univ_options, tool_options['consensus'], disk='100M', memory='100M',
cores=1).encapsulate()
for caller in mutations:
mutations[caller].addChild(get_mutations)
Expand Down Expand Up @@ -768,7 +794,7 @@ def get_all_tool_inputs(job, tools, outer_key='', mutation_caller_list=None):
indexes = tools.pop('indexes')
indexes['chromosomes'] = parse_chromosome_string(job, indexes['chromosomes'])
for mutation_caller in mutation_caller_list:
if mutation_caller == 'indexes':
if mutation_caller in ('indexes', 'consensus'):
continue
tools[mutation_caller].update(indexes)
return tools
Expand Down
8 changes: 8 additions & 0 deletions src/protect/pipeline/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,22 +55,30 @@ expression_estimation:
version: 1.2.20

mutation_calling:
consensus:
indel_majority:
snv_majority:
indexes:
chromosomes:
mutect:
java_Xmx: 2G
version: 1.1.7
run: True
muse:
version: 1.0rc_submission_b391201
run: True
radia:
version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03
run: True
somaticsniper:
run: True
version: 1.0.4
samtools:
version: 0.1.8
bam_readcount:
version: 0.7.4
strelka:
run: True
version: 1.0.15

star_fusion:
Expand Down
Loading