BD2KGenomics · davidstew · May 15, 2018 · May 18, 2018
diff --git a/MANUAL.md b/MANUAL.md
@@ -335,6 +335,11 @@ be substituted with S3 links. Descriptions for creating all files can be found i
             version: 1.2.0
 
     mutation_calling:
+        consensus:
+            indel_majority:                                   -> Number of callers required to accept an indel.
+                                                                 Will dynamically compute the indel majority by default.
+            snv_majority:                                     -> Number of callers required to accept an snv.
+                                                                 Will dynamically compute the snv majority by default.
         indexes:
             chromsosomes: canonical, canonical_chr, chr1, Y       -> A list of chromosomes to process.
                                                                      This options overrides the
@@ -362,8 +367,10 @@ be substituted with S3 links. Descriptions for creating all files can be found i
             java_Xmx: 5G                                          -> The heap size to use for MuTect
                                                                      per job (i.e. per chromosome)
             version: 1.1.7
+            run: True                                             -> Switch to skip calling mutect
         muse:
             version: 1.0rc_submission_b391201
+            run: True                                             -> Switch to skip calling muse
         radia:                                                -> Radia uses perchrom bed files in
                                                                  folders as references.
             cosmic_beds: /path/to/radia_cosmic.tar.gz
@@ -372,17 +379,20 @@ be substituted with S3 links. Descriptions for creating all files can be found i
             pseudogene_beds: /path/to/radia_pseudogenes.tar.gz
             gencode_beds: /path/to/radia_gencode.tar.gz
             version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03
+            run: True                                             -> Switch to skip calling radia
         somaticsniper:
             version: 1.0.4
             samtools:                                           -> pileup reads
                 version: 0.1.8
             bam_readcount:                                      -> obtain readcounts
                 version: 0.7.4
+            run: True                                            -> Switch to skip calling somaticsniper
         strelka:
             version: 1.0.15
             config_file: /path/to/strelka_config.ini.tar.gz       -> The Strelka config file for a
                                                                      bwa run (modified for a WXS run
                                                                      if necessary)
+            run: True                                           -> Switch to skip calling strelka
         star_fusion:
             run: True                                           -> Switch to skip fusion calling
             version: 1.0.0

diff --git a/src/protect/common.py b/src/protect/common.py
@@ -638,7 +638,7 @@ def email_report(job, univ_options):
         server = smtplib.SMTP('localhost')
     except socket.error as e:
         if e.errno == 111:
-            print('No mail utils on this maachine')
+            print('No mail utils on this machine')
         else:
             print('Unexpected error while attempting to send an email.')
         print('Could not send email report')

diff --git a/src/protect/mutation_annotation/snpeff.py b/src/protect/mutation_annotation/snpeff.py
@@ -39,14 +39,15 @@ def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options):
     :return: fsID for the snpeffed vcf
     :rtype: toil.fileStore.FileID
     """
+    if merged_mutation_file is None:
+        return None
     work_dir = os.getcwd()
     input_files = {
         'merged_mutations.vcf': merged_mutation_file,
         'snpeff_index.tar.gz': snpeff_options['index']}
     input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
     input_files['snpeff_index'] = untargz(input_files['snpeff_index.tar.gz'], work_dir)
     input_files = {key: docker_path(path) for key, path in input_files.items()}
-
     parameters = ['eff',
                   '-dataDir', input_files['snpeff_index'],
                   '-c', '/'.join([input_files['snpeff_index'],

diff --git a/src/protect/mutation_calling/common.py b/src/protect/mutation_calling/common.py
@@ -51,34 +51,54 @@ def chromosomes_from_fai(genome_fai):
     return chromosomes
 
 
-def run_mutation_aggregator(job, mutation_results, univ_options):
+def run_mutation_aggregator(job, mutation_results, univ_options, consensus_options):
     """
     Aggregate all the called mutations.
 
     :param dict mutation_results: Dict of dicts of the various mutation callers in a per chromosome
            format
     :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict consensus_options: options specific for consensus mutation calling
     :returns: fsID for the merged mutations file
     :rtype: toil.fileStore.FileID
     """
     # Setup an input data structure for the merge function
     out = {}
-    for chrom in mutation_results['mutect'].keys():
-        out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results,
-                                       univ_options).rv()
-    merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options)
-    job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient'])
-    return merged_snvs.rv()
+    chroms = {}
+    # Extract the chromosomes from a mutation caller if at least one mutation caller is selected.  All callers should
+    # have the same chromosomes.
+    for caller in mutation_results:
+        if mutation_results[caller] is None:
+            continue
+        else:
+            if caller == 'strelka':
+                if mutation_results['strelka']['snvs'] is None:
+                    continue
+                chroms = mutation_results['strelka']['snvs'].keys()
+            else:
+                chroms = mutation_results[caller].keys()
+            break
+    if chroms:
+        for chrom in chroms:
+            out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results,
+                                           univ_options, consensus_options).rv()
+        merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options)
+        job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient'])
+        return merged_snvs.rv()
+    else:
+        return None
+
 
 
-def merge_perchrom_mutations(job, chrom, mutations, univ_options):
+def merge_perchrom_mutations(job, chrom, mutations, univ_options, consensus_options):
     """
     Merge the mutation calls for a single chromosome.
 
     :param str chrom: Chromosome to process
     :param dict mutations: dict of dicts of the various mutation caller names as keys, and a dict of
            per chromosome job store ids for vcfs as value
     :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict consensus_options: options specific for consensus mutation calling
     :returns fsID for vcf contaning merged calls for the given chromosome
     :rtype: toil.fileStore.FileID
     """
@@ -91,39 +111,40 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options):
     mutations.pop('indels')
     mutations['strelka_indels'] = mutations['strelka']['indels']
     mutations['strelka_snvs'] = mutations['strelka']['snvs']
-    vcf_processor = {'snvs': {'mutect': process_mutect_vcf,
+    vcf_processor = {'snv': {'mutect': process_mutect_vcf,
                               'muse': process_muse_vcf,
                               'radia': process_radia_vcf,
                               'somaticsniper': process_somaticsniper_vcf,
                               'strelka_snvs': process_strelka_vcf
                               },
-                     'indels': {'strelka_indels': process_strelka_vcf
+                     'indel': {'strelka_indels': process_strelka_vcf
                                 }
                      }
-    #                 'fusions': lambda x: None,
-    #                 'indels': lambda x: None}
-    # For now, let's just say 2 out of n need to call it.
-    # num_preds = len(mutations)
-    # majority = int((num_preds + 0.5) / 2)
-    majority = {'snvs': 2,
-                'indels': 1}
-
     accepted_hits = defaultdict(dict)
-
     for mut_type in vcf_processor.keys():
         # Get input files
         perchrom_mutations = {caller: vcf_processor[mut_type][caller](job, mutations[caller][chrom],
                                                                       work_dir, univ_options)
-                              for caller in vcf_processor[mut_type]}
+                              for caller in vcf_processor[mut_type]
+                              if mutations[caller] is not None}
+        if not perchrom_mutations:
+            continue
         # Process the strelka key
-        perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type]
-        perchrom_mutations.pop('strelka_' + mut_type)
+        if 'strelka_' + mut_type in perchrom_mutations:
+            perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type]
+            perchrom_mutations.pop('strelka_' + mut_type)
+        if consensus_options[mut_type + '_majority'] is not None:
+            majority = consensus_options[mut_type + '_majority']
+        elif len(perchrom_mutations) <= 2:
+            majority = 1
+        else:
+            majority = (len(perchrom_mutations) + 1) / 2
         # Read in each file to a dict
         vcf_lists = {caller: read_vcf(vcf_file) for caller, vcf_file in perchrom_mutations.items()}
         all_positions = list(set(itertools.chain(*vcf_lists.values())))
         for position in sorted(all_positions):
             hits = {caller: position in vcf_lists[caller] for caller in perchrom_mutations.keys()}
-            if sum(hits.values()) >= majority[mut_type]:
+            if sum(hits.values()) >= majority:
                 callers = ','.join([caller for caller, hit in hits.items() if hit])
                 assert position[1] not in accepted_hits[position[0]]
                 accepted_hits[position[0]][position[1]] = (position[2], position[3], callers)
@@ -157,7 +178,7 @@ def read_vcf(vcf_file):
             if line.startswith('#'):
                 continue
             line = line.strip().split()
-            vcf_dict.append((line[0], line[1], line[3], line[4]))
+            vcf_dict.append((line[0], line[1], line[3].upper(), line[4].upper()))
     return vcf_dict
 
 

diff --git a/src/protect/mutation_calling/indel.py b/src/protect/mutation_calling/indel.py
@@ -27,7 +27,9 @@ def run_indel_caller(job, tumor_bam, normal_bam, univ_options, indel_options):
     :return: fsID to the merged fusion calls
     :rtype: toil.fileStore.FileID
     """
-    job.fileStore.logToMaster('INDELs are currently unsupported.... Skipping.')
+    job.fileStore.logToMaster('INDELs currently occur only through strelka.')
+    if indel_options['run'] is False:
+        return None
     indel_file = job.fileStore.getLocalTempFile()
     output_file = job.fileStore.writeGlobalFile(indel_file)
     job.fileStore.logToMaster('Ran INDEL on %s successfully' % univ_options['patient'])

diff --git a/src/protect/mutation_calling/muse.py b/src/protect/mutation_calling/muse.py
@@ -78,6 +78,8 @@ def run_muse(job, tumor_bam, normal_bam, univ_options, muse_options):
                  +- 'chrM': fsID
     :rtype: dict
     """
+    if muse_options['run'] is False:
+        return None
     # Get a list of chromosomes to handle
     if muse_options['chromosomes']:
         chromosomes = muse_options['chromosomes']

diff --git a/src/protect/mutation_calling/mutect.py b/src/protect/mutation_calling/mutect.py
@@ -75,6 +75,8 @@ def run_mutect(job, tumor_bam, normal_bam, univ_options, mutect_options):
                  +- 'chrM': fsID
     :rtype: dict
     """
+    if mutect_options['run'] is False:
+        return None
     # Get a list of chromosomes to handle
     if mutect_options['chromosomes']:
         chromosomes = mutect_options['chromosomes']

diff --git a/src/protect/mutation_calling/radia.py b/src/protect/mutation_calling/radia.py
@@ -87,6 +87,8 @@ def run_radia(job, rna_bam, tumor_bam, normal_bam, univ_options, radia_options):
                  +- 'chrM': fsID
     :rtype: dict
     """
+    if radia_options['run'] is False:
+        return None
     if 'rna_genome' in rna_bam.keys():
         rna_bam = rna_bam['rna_genome']
     elif set(rna_bam.keys()) == {'rna_genome_sorted.bam', 'rna_genome_sorted.bam.bai'}:

diff --git a/src/protect/mutation_calling/somaticsniper.py b/src/protect/mutation_calling/somaticsniper.py
@@ -85,6 +85,8 @@ def run_somaticsniper(job, tumor_bam, normal_bam, univ_options, somaticsniper_op
                  +- 'chrM': fsID
     :rtype: toil.fileStore.FileID|dict
     """
+    if somaticsniper_options['run'] is False:
+        return None
     # Get a list of chromosomes to handle
     if somaticsniper_options['chromosomes']:
         chromosomes = somaticsniper_options['chromosomes']

diff --git a/src/protect/mutation_calling/strelka.py b/src/protect/mutation_calling/strelka.py
@@ -45,7 +45,10 @@ def run_strelka_with_merge(job, tumor_bam, normal_bam, univ_options, strelka_opt
     :param dict univ_options: Dict of universal options used by almost all tools
     :param dict strelka_options: Options specific to strelka
     :return: fsID to the merged strelka calls
-    :rtype: toil.fileStore.FileID
+    :rtype: dict(str, toil.fileStore.FileID)
+              |-'snvs': fsID
+              |
+              +-'indels': fsID
     """
     spawn = job.wrapJobFn(run_strelka, tumor_bam, normal_bam, univ_options,
                           strelka_options, split=False).encapsulate()
@@ -63,8 +66,9 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split
     :param dict univ_options: Dict of universal options used by almost all tools
     :param dict strelka_options: Options specific to strelka
     :param bool split: Should the results be split into perchrom vcfs?
-    :return: Either the fsID to the genome-level vcf or a dict of results from running strelka
-             on every chromosome
+    :return: Either the a dict of results from running strelka on every chromosome, or a dict of running strelka on the
+             whole genome
+
              perchrom_strelka:
                  |- 'chr1':
                  |      |-'snvs': fsID
@@ -77,8 +81,16 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split
                  +- 'chrM':
                         |-'snvs': fsID
                         +-'indels': fsID
-    :rtype: toil.fileStore.FileID|dict
+
+             genome_strelka:
+                 |-'snvs': fsID
+                 |
+                 +-'indels': fsID
+
+    :rtype: dict
     """
+    if strelka_options['run'] is False:
+            return {'snvs': None, 'indels': None}
     if strelka_options['chromosomes']:
         chromosomes = strelka_options['chromosomes']
     else:

diff --git a/src/protect/pipeline/ProTECT.py b/src/protect/pipeline/ProTECT.py
@@ -354,6 +354,8 @@ def _parse_config_file(job, config_file, max_cores=None):
 
     # Flags to check for presence of encryption keys if required
     gdc_inputs = ssec_encrypted = False
+    # Flag to check if a sample without an input vcf/bedpe was provided
+    sample_without_variants = False
     for key in input_config.keys():
         if key == 'patients':
             # Ensure each patient contains the required entries
@@ -373,6 +375,9 @@ def _parse_config_file(job, config_file, max_cores=None):
                         raise ParameterError('Cannot run ProTECT using GDC RNA bams. Please fix '
                                              'sample %s' % sample_name)
                     gdc_inputs = True
+                if ('mutation_vcf' not in sample_set[sample_name]
+                        and 'fusion_bedpe' not in sample_set[sample_name]):
+                    sample_without_variants = True
         else:
             # Ensure the required entries exist for this key
             _ensure_set_contains(input_config[key], required_keys[key], key)
@@ -427,6 +432,27 @@ def _parse_config_file(job, config_file, max_cores=None):
                                                                 'token.')
     # Get all the tool inputs
     job.fileStore.logToMaster('Obtaining tool inputs')
+    if (sample_without_variants and all(tool_options[k]['run'] is False
+                                        for k in mutation_caller_list
+                                        if k not in ('indexes', 'fusion_inspector', 'consensus'))):
+        raise RuntimeError("Cannot run mutation callers if all callers are set to run = False.")
+
+    for mut_type in ('snv', 'indel'):
+        if tool_options['consensus'][mut_type + '_majority'] is not None:
+            if not isinstance(tool_options['consensus'][mut_type + '_majority'], int):
+                raise RuntimeError('Majorities have to be integers. Got %s for %s_majority.' %
+                                   (tool_options['consensus'][mut_type + '_majority'], mut_type))
+            if mut_type == 'snv':
+                count = sum(tool_options[k]['run'] is True
+                            for k in ('muse', 'mutect', 'somaticsniper', 'radia', 'strelka'))
+            else:
+                count = 1 if tool_options['strelka']['run'] is True else 0
+            if tool_options['consensus'][mut_type + '_majority'] > count:
+                raise RuntimeError('Majority cannot be greater than the number of callers. '
+                                   'Got number of %s callers = %s majority = %s.' %
+                                   (mut_type, count,
+                                    tool_options['consensus'][mut_type + '_majority']))
+
     process_tool_inputs = job.addChildJobFn(get_all_tool_inputs, tool_options,
                                             mutation_caller_list=mutation_caller_list)
     job.fileStore.logToMaster('Obtained tool inputs')
@@ -670,15 +696,15 @@ def launch_protect(job, patient_data, univ_options, tool_options):
                                      bam_files['normal_dna'].rv(), univ_options,
                                      tool_options['strelka']).encapsulate(),
             'indels': job.wrapJobFn(run_indel_caller, bam_files['tumor_dna'].rv(),
-                                    bam_files['normal_dna'].rv(), univ_options, 'indel_options',
+                                    bam_files['normal_dna'].rv(), univ_options, {'run': False},
                                     disk='100M', memory='100M', cores=1)}
         for sample_type in 'tumor_dna', 'normal_dna':
             for caller in mutations:
                 bam_files[sample_type].addChild(mutations[caller])
         bam_files['tumor_rna'].addChild(mutations['radia'])
         get_mutations = job.wrapJobFn(run_mutation_aggregator,
                                       {caller: cjob.rv() for caller, cjob in mutations.items()},
-                                      univ_options, disk='100M', memory='100M',
+                                      univ_options, tool_options['consensus'], disk='100M', memory='100M',
                                       cores=1).encapsulate()
         for caller in mutations:
             mutations[caller].addChild(get_mutations)
@@ -768,7 +794,7 @@ def get_all_tool_inputs(job, tools, outer_key='', mutation_caller_list=None):
         indexes = tools.pop('indexes')
         indexes['chromosomes'] = parse_chromosome_string(job, indexes['chromosomes'])
         for mutation_caller in mutation_caller_list:
-            if mutation_caller == 'indexes':
+            if mutation_caller in ('indexes', 'consensus'):
                 continue
             tools[mutation_caller].update(indexes)
     return tools

diff --git a/src/protect/pipeline/defaults.yaml b/src/protect/pipeline/defaults.yaml
@@ -55,22 +55,30 @@ expression_estimation:
         version: 1.2.20
 
 mutation_calling:
+    consensus:
+        indel_majority:
+        snv_majority:
     indexes:
         chromosomes:
     mutect:
         java_Xmx: 2G
         version: 1.1.7
+        run: True
     muse:
         version: 1.0rc_submission_b391201
+        run: True
     radia:
         version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03
+        run: True
     somaticsniper:
+        run: True
         version: 1.0.4
         samtools:
             version: 0.1.8
         bam_readcount:
             version: 0.7.4
     strelka:
+        run: True
         version: 1.0.15
 
     star_fusion: