Skip to content

Commit

Permalink
Make majority a user provided option (resolves #286)
Browse files Browse the repository at this point in the history
resolves #286

Added snv_majority and indel_majority in a consensus tab under mutation_calling
in input_parameters.yaml. If values are not provided, the majority will be dynamically
figured out.
  • Loading branch information
davidstew committed Jun 5, 2018
1 parent 1945252 commit b018a59
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 11 deletions.
5 changes: 5 additions & 0 deletions MANUAL.md
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,11 @@ be substituted with S3 links. Descriptions for creating all files can be found i
version: 1.2.0

mutation_calling:
consensus:
indel_majority: -> Number of callers required to accept an indel.
Will dynamically compute the indel majority by default.
snv_majority: -> Number of callers required to accept an snv.
Will dynamically compute the snv majority by default.
indexes:
chromsosomes: canonical, canonical_chr, chr1, Y -> A list of chromosomes to process.
This options overrides the
Expand Down
2 changes: 1 addition & 1 deletion src/protect/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,7 @@ def email_report(job, univ_options):
server = smtplib.SMTP('localhost')
except socket.error as e:
if e.errno == 111:
print('No mail utils on this maachine')
print('No mail utils on this machine')
else:
print('Unexpected error while attempting to send an email.')
print('Could not send email report')
Expand Down
21 changes: 14 additions & 7 deletions src/protect/mutation_calling/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,14 @@ def chromosomes_from_fai(genome_fai):
return chromosomes


def run_mutation_aggregator(job, mutation_results, univ_options):
def run_mutation_aggregator(job, mutation_results, univ_options, consensus_options):
"""
Aggregate all the called mutations.
:param dict mutation_results: Dict of dicts of the various mutation callers in a per chromosome
format
:param dict univ_options: Dict of universal options used by almost all tools
:param dict consensus_options: options specific for consensus mutation calling
:returns: fsID for the merged mutations file
:rtype: toil.fileStore.FileID
"""
Expand All @@ -80,7 +81,7 @@ def run_mutation_aggregator(job, mutation_results, univ_options):
if chroms:
for chrom in chroms:
out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results,
univ_options).rv()
univ_options, consensus_options).rv()
merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options)
job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient'])
return merged_snvs.rv()
Expand All @@ -89,14 +90,15 @@ def run_mutation_aggregator(job, mutation_results, univ_options):



def merge_perchrom_mutations(job, chrom, mutations, univ_options):
def merge_perchrom_mutations(job, chrom, mutations, univ_options, consensus_options):
"""
Merge the mutation calls for a single chromosome.
:param str chrom: Chromosome to process
:param dict mutations: dict of dicts of the various mutation caller names as keys, and a dict of
per chromosome job store ids for vcfs as value
:param dict univ_options: Dict of universal options used by almost all tools
:param dict consensus_options: options specific for consensus mutation calling
:returns fsID for vcf contaning merged calls for the given chromosome
:rtype: toil.fileStore.FileID
"""
Expand All @@ -109,13 +111,13 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options):
mutations.pop('indels')
mutations['strelka_indels'] = mutations['strelka']['indels']
mutations['strelka_snvs'] = mutations['strelka']['snvs']
vcf_processor = {'snvs': {'mutect': process_mutect_vcf,
vcf_processor = {'snv': {'mutect': process_mutect_vcf,
'muse': process_muse_vcf,
'radia': process_radia_vcf,
'somaticsniper': process_somaticsniper_vcf,
'strelka_snvs': process_strelka_vcf
},
'indels': {'strelka_indels': process_strelka_vcf
'indel': {'strelka_indels': process_strelka_vcf
}
}
accepted_hits = defaultdict(dict)
Expand All @@ -131,7 +133,12 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options):
if 'strelka_' + mut_type in perchrom_mutations:
perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type]
perchrom_mutations.pop('strelka_' + mut_type)
majority = 1 if len(perchrom_mutations) <= 2 else (len(perchrom_mutations) + 1) / 2
if consensus_options[mut_type + '_majority'] is not None:
majority = consensus_options[mut_type + '_majority']
elif len(perchrom_mutations) <= 2:
majority = 1
else:
majority = (len(perchrom_mutations) + 1) / 2
# Read in each file to a dict
vcf_lists = {caller: read_vcf(vcf_file) for caller, vcf_file in perchrom_mutations.items()}
all_positions = list(set(itertools.chain(*vcf_lists.values())))
Expand Down Expand Up @@ -171,7 +178,7 @@ def read_vcf(vcf_file):
if line.startswith('#'):
continue
line = line.strip().split()
vcf_dict.append((line[0], line[1], line[3], line[4]))
vcf_dict.append((line[0], line[1], line[3].upper(), line[4].upper()))
return vcf_dict


Expand Down
23 changes: 20 additions & 3 deletions src/protect/pipeline/ProTECT.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,8 +434,25 @@ def _parse_config_file(job, config_file, max_cores=None):
job.fileStore.logToMaster('Obtaining tool inputs')
if (sample_without_variants and all(tool_options[k]['run'] is False
for k in mutation_caller_list
if k not in ('indexes', 'fusion_inspector'))):
if k not in ('indexes', 'fusion_inspector', 'consensus'))):
raise RuntimeError("Cannot run mutation callers if all callers are set to run = False.")

for mut_type in ('snv', 'indel'):
if tool_options['consensus'][mut_type + '_majority'] is not None:
if not isinstance(tool_options['consensus'][mut_type + '_majority'], int):
raise RuntimeError('Majorities have to be integers. Got %s for %s_majority.' %
(tool_options['consensus'][mut_type + '_majority'], mut_type))
if mut_type == 'snv':
count = sum(tool_options[k]['run'] is True
for k in ('muse', 'mutect', 'somaticsniper', 'radia', 'strelka'))
else:
count = 1 if tool_options['strelka']['run'] is True else 0
if tool_options['consensus'][mut_type + '_majority'] > count:
raise RuntimeError('Majority cannot be greater than the number of callers. '
'Got number of %s callers = %s majority = %s.' %
(mut_type, count,
tool_options['consensus'][mut_type + '_majority']))

process_tool_inputs = job.addChildJobFn(get_all_tool_inputs, tool_options,
mutation_caller_list=mutation_caller_list)
job.fileStore.logToMaster('Obtained tool inputs')
Expand Down Expand Up @@ -687,7 +704,7 @@ def launch_protect(job, patient_data, univ_options, tool_options):
bam_files['tumor_rna'].addChild(mutations['radia'])
get_mutations = job.wrapJobFn(run_mutation_aggregator,
{caller: cjob.rv() for caller, cjob in mutations.items()},
univ_options, disk='100M', memory='100M',
univ_options, tool_options['consensus'], disk='100M', memory='100M',
cores=1).encapsulate()
for caller in mutations:
mutations[caller].addChild(get_mutations)
Expand Down Expand Up @@ -777,7 +794,7 @@ def get_all_tool_inputs(job, tools, outer_key='', mutation_caller_list=None):
indexes = tools.pop('indexes')
indexes['chromosomes'] = parse_chromosome_string(job, indexes['chromosomes'])
for mutation_caller in mutation_caller_list:
if mutation_caller == 'indexes':
if mutation_caller in ('indexes', 'consensus'):
continue
tools[mutation_caller].update(indexes)
return tools
Expand Down
3 changes: 3 additions & 0 deletions src/protect/pipeline/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ expression_estimation:
version: 1.2.20

mutation_calling:
consensus:
indel_majority:
snv_majority:
indexes:
chromosomes:
mutect:
Expand Down
3 changes: 3 additions & 0 deletions src/protect/pipeline/input_parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ expression_estimation:
# version: 1.2.0

mutation_calling:
consensus:
indel_majority: 1
snv_majority: 2
indexes:
chromosomes: canonical_chr, chrM
genome_fasta: S3://protect-data/hg38_references/hg38.fa.tar.gz
Expand Down

0 comments on commit b018a59

Please sign in to comment.