From f1e1f69dfab551426da4f3e6f32be0f75b3f7002 Mon Sep 17 00:00:00 2001 From: davidstew Date: Thu, 10 May 2018 17:56:55 -0700 Subject: [PATCH] Add mix and match functionality for mutation calling (resolves #265) --- src/protect/mutation_annotation/snpeff.py | 3 +- src/protect/mutation_calling/common.py | 44 +++++++++++++------ src/protect/mutation_calling/indel.py | 4 +- src/protect/mutation_calling/muse.py | 3 ++ src/protect/mutation_calling/mutect.py | 3 ++ src/protect/mutation_calling/radia.py | 2 + src/protect/mutation_calling/somaticsniper.py | 3 ++ src/protect/mutation_calling/strelka.py | 20 +++++++-- src/protect/pipeline/ProTECT.py | 8 +++- src/protect/pipeline/defaults.yaml | 5 +++ src/protect/pipeline/input_parameters.yaml | 5 +++ 11 files changed, 79 insertions(+), 21 deletions(-) diff --git a/src/protect/mutation_annotation/snpeff.py b/src/protect/mutation_annotation/snpeff.py index 43ed03c0..e3665370 100644 --- a/src/protect/mutation_annotation/snpeff.py +++ b/src/protect/mutation_annotation/snpeff.py @@ -39,6 +39,8 @@ def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options): :return: fsID for the snpeffed vcf :rtype: toil.fileStore.FileID """ + if merged_mutation_file is None: + return None work_dir = os.getcwd() input_files = { 'merged_mutations.vcf': merged_mutation_file, @@ -46,7 +48,6 @@ def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options): input_files = get_files_from_filestore(job, input_files, work_dir, docker=False) input_files['snpeff_index'] = untargz(input_files['snpeff_index.tar.gz'], work_dir) input_files = {key: docker_path(path) for key, path in input_files.items()} - parameters = ['eff', '-dataDir', input_files['snpeff_index'], '-c', '/'.join([input_files['snpeff_index'], diff --git a/src/protect/mutation_calling/common.py b/src/protect/mutation_calling/common.py index 947426d2..c9062d3b 100644 --- a/src/protect/mutation_calling/common.py +++ b/src/protect/mutation_calling/common.py @@ -63,12 +63,28 @@ def run_mutation_aggregator(job, mutation_results, univ_options): """ # Setup an input data structure for the merge function out = {} - for chrom in mutation_results['mutect'].keys(): - out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results, - univ_options).rv() - merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options) - job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient']) - return merged_snvs.rv() + chroms = {} + for caller in mutation_results: + if mutation_results[caller] is None: + continue + else: + if caller == 'strelka': + if mutation_results['strelka']['snvs'] is None: + continue + chroms = mutation_results['strelka']['snvs'].keys() + else: + chroms = mutation_results[caller].keys() + break + if chroms: + for chrom in chroms: + out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results, + univ_options).rv() + merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options) + job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient']) + return merged_snvs.rv() + else: + return None + def merge_perchrom_mutations(job, chrom, mutations, univ_options): @@ -105,25 +121,27 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options): # For now, let's just say 2 out of n need to call it. # num_preds = len(mutations) # majority = int((num_preds + 0.5) / 2) - majority = {'snvs': 2, - 'indels': 1} - accepted_hits = defaultdict(dict) for mut_type in vcf_processor.keys(): # Get input files perchrom_mutations = {caller: vcf_processor[mut_type][caller](job, mutations[caller][chrom], work_dir, univ_options) - for caller in vcf_processor[mut_type]} + for caller in vcf_processor[mut_type] + if mutations[caller] is not None} # Process the strelka key - perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type] - perchrom_mutations.pop('strelka_' + mut_type) + if 'strelka' + mut_type in perchrom_mutations: + perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type] + perchrom_mutations.pop('strelka_' + mut_type) + if not perchrom_mutations: + continue + majority = 1 if len(perchrom_mutations) <= 2 else len(perchrom_mutations) / 2 # Read in each file to a dict vcf_lists = {caller: read_vcf(vcf_file) for caller, vcf_file in perchrom_mutations.items()} all_positions = list(set(itertools.chain(*vcf_lists.values()))) for position in sorted(all_positions): hits = {caller: position in vcf_lists[caller] for caller in perchrom_mutations.keys()} - if sum(hits.values()) >= majority[mut_type]: + if sum(hits.values()) >= majority: callers = ','.join([caller for caller, hit in hits.items() if hit]) assert position[1] not in accepted_hits[position[0]] accepted_hits[position[0]][position[1]] = (position[2], position[3], callers) diff --git a/src/protect/mutation_calling/indel.py b/src/protect/mutation_calling/indel.py index 7c8990da..81c3da6a 100644 --- a/src/protect/mutation_calling/indel.py +++ b/src/protect/mutation_calling/indel.py @@ -27,7 +27,9 @@ def run_indel_caller(job, tumor_bam, normal_bam, univ_options, indel_options): :return: fsID to the merged fusion calls :rtype: toil.fileStore.FileID """ - job.fileStore.logToMaster('INDELs are currently unsupported.... Skipping.') + job.fileStore.logToMaster('INDELs currently occur only through strelka.') + if indel_options['run'] is False: + return None indel_file = job.fileStore.getLocalTempFile() output_file = job.fileStore.writeGlobalFile(indel_file) job.fileStore.logToMaster('Ran INDEL on %s successfully' % univ_options['patient']) diff --git a/src/protect/mutation_calling/muse.py b/src/protect/mutation_calling/muse.py index 28be1c6b..9078a7a3 100644 --- a/src/protect/mutation_calling/muse.py +++ b/src/protect/mutation_calling/muse.py @@ -78,6 +78,9 @@ def run_muse(job, tumor_bam, normal_bam, univ_options, muse_options): +- 'chrM': fsID :rtype: dict """ + + if muse_options['run'] is False: + return None # Get a list of chromosomes to handle if muse_options['chromosomes']: chromosomes = muse_options['chromosomes'] diff --git a/src/protect/mutation_calling/mutect.py b/src/protect/mutation_calling/mutect.py index eb1a556f..f79088c4 100644 --- a/src/protect/mutation_calling/mutect.py +++ b/src/protect/mutation_calling/mutect.py @@ -75,6 +75,9 @@ def run_mutect(job, tumor_bam, normal_bam, univ_options, mutect_options): +- 'chrM': fsID :rtype: dict """ + + if mutect_options['run'] is False: + return None # Get a list of chromosomes to handle if mutect_options['chromosomes']: chromosomes = mutect_options['chromosomes'] diff --git a/src/protect/mutation_calling/radia.py b/src/protect/mutation_calling/radia.py index 178bd8dd..6d4c95cd 100644 --- a/src/protect/mutation_calling/radia.py +++ b/src/protect/mutation_calling/radia.py @@ -87,6 +87,8 @@ def run_radia(job, rna_bam, tumor_bam, normal_bam, univ_options, radia_options): +- 'chrM': fsID :rtype: dict """ + if radia_options['run'] is False: + return None if 'rna_genome' in rna_bam.keys(): rna_bam = rna_bam['rna_genome'] elif set(rna_bam.keys()) == {'rna_genome_sorted.bam', 'rna_genome_sorted.bam.bai'}: diff --git a/src/protect/mutation_calling/somaticsniper.py b/src/protect/mutation_calling/somaticsniper.py index ae148f04..6ffbd454 100644 --- a/src/protect/mutation_calling/somaticsniper.py +++ b/src/protect/mutation_calling/somaticsniper.py @@ -85,6 +85,9 @@ def run_somaticsniper(job, tumor_bam, normal_bam, univ_options, somaticsniper_op +- 'chrM': fsID :rtype: toil.fileStore.FileID|dict """ + + if somaticsniper_options['run'] is False: + return None # Get a list of chromosomes to handle if somaticsniper_options['chromosomes']: chromosomes = somaticsniper_options['chromosomes'] diff --git a/src/protect/mutation_calling/strelka.py b/src/protect/mutation_calling/strelka.py index 0a1a0e7b..708b6cb1 100644 --- a/src/protect/mutation_calling/strelka.py +++ b/src/protect/mutation_calling/strelka.py @@ -45,7 +45,10 @@ def run_strelka_with_merge(job, tumor_bam, normal_bam, univ_options, strelka_opt :param dict univ_options: Dict of universal options used by almost all tools :param dict strelka_options: Options specific to strelka :return: fsID to the merged strelka calls - :rtype: toil.fileStore.FileID + :rtype: dict(str, toil.fileStore.FileID) + |-'snvs': fsID + | + +-'indels': fsID """ spawn = job.wrapJobFn(run_strelka, tumor_bam, normal_bam, univ_options, strelka_options, split=False).encapsulate() @@ -63,8 +66,9 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split :param dict univ_options: Dict of universal options used by almost all tools :param dict strelka_options: Options specific to strelka :param bool split: Should the results be split into perchrom vcfs? - :return: Either the fsID to the genome-level vcf or a dict of results from running strelka - on every chromosome + :return: Either the a dict of results from running strelka on every chromosome, or a dict of running strelka on the + whole genome + perchrom_strelka: |- 'chr1': | |-'snvs': fsID @@ -77,8 +81,16 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split +- 'chrM': |-'snvs': fsID +-'indels': fsID - :rtype: toil.fileStore.FileID|dict + + genome_strelka: + |-'snvs': fsID + | + +-'indels': fsID + + :rtype: dict """ + if strelka_options['run'] is False: + return {'snvs': None, 'indels': None} if strelka_options['chromosomes']: chromosomes = strelka_options['chromosomes'] else: diff --git a/src/protect/pipeline/ProTECT.py b/src/protect/pipeline/ProTECT.py index 6b40fb79..65da5e5d 100644 --- a/src/protect/pipeline/ProTECT.py +++ b/src/protect/pipeline/ProTECT.py @@ -353,7 +353,7 @@ def _parse_config_file(job, config_file, max_cores=None): input_config = _add_default_entries(input_config, protect_defaults) # Flags to check for presence of encryption keys if required - gdc_inputs = ssec_encrypted = False + gdc_inputs = ssec_encrypted = sample_without_variants = False for key in input_config.keys(): if key == 'patients': # Ensure each patient contains the required entries @@ -373,6 +373,8 @@ def _parse_config_file(job, config_file, max_cores=None): raise ParameterError('Cannot run ProTECT using GDC RNA bams. Please fix ' 'sample %s' % sample_name) gdc_inputs = True + if 'mutation_vcf' not in sample_set[sample_name] and 'fusion_bedpe' not in sample_set[sample_name]: + sample_without_variants = True else: # Ensure the required entries exist for this key _ensure_set_contains(input_config[key], required_keys[key], key) @@ -427,6 +429,8 @@ def _parse_config_file(job, config_file, max_cores=None): 'token.') # Get all the tool inputs job.fileStore.logToMaster('Obtaining tool inputs') + if sample_without_variants and all(tool_options[k]['run'] is False for k in mutation_caller_list if k not in ('indexes', 'fusion_inspector')): + raise RuntimeError("Cannot run mutation callers if all callers are set to run = False.") process_tool_inputs = job.addChildJobFn(get_all_tool_inputs, tool_options, mutation_caller_list=mutation_caller_list) job.fileStore.logToMaster('Obtained tool inputs') @@ -670,7 +674,7 @@ def launch_protect(job, patient_data, univ_options, tool_options): bam_files['normal_dna'].rv(), univ_options, tool_options['strelka']).encapsulate(), 'indels': job.wrapJobFn(run_indel_caller, bam_files['tumor_dna'].rv(), - bam_files['normal_dna'].rv(), univ_options, 'indel_options', + bam_files['normal_dna'].rv(), univ_options, {'run': False}, disk='100M', memory='100M', cores=1)} for sample_type in 'tumor_dna', 'normal_dna': for caller in mutations: diff --git a/src/protect/pipeline/defaults.yaml b/src/protect/pipeline/defaults.yaml index 145d2fa4..046e043d 100644 --- a/src/protect/pipeline/defaults.yaml +++ b/src/protect/pipeline/defaults.yaml @@ -60,17 +60,22 @@ mutation_calling: mutect: java_Xmx: 2G version: 1.1.7 + run: True muse: version: 1.0rc_submission_b391201 + run: True radia: version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03 + run: True somaticsniper: + run: True version: 1.0.4 samtools: version: 0.1.8 bam_readcount: version: 0.7.4 strelka: + run: True version: 1.0.15 star_fusion: diff --git a/src/protect/pipeline/input_parameters.yaml b/src/protect/pipeline/input_parameters.yaml index 5dc5943e..2d6d652e 100644 --- a/src/protect/pipeline/input_parameters.yaml +++ b/src/protect/pipeline/input_parameters.yaml @@ -95,8 +95,10 @@ mutation_calling: dbsnp_tbi: S3://protect-data/hg38_references/dbsnp_coding.vcf.gz.tbi mutect: java_Xmx: 2G + # run: True # version: 1.1.7 muse: + # run: True # version: 1.0rc_submission_b391201 radia: cosmic_beds: S3://protect-data/hg38_references/radia_cosmic.tar.gz @@ -104,8 +106,10 @@ mutation_calling: retrogene_beds: S3://protect-data/hg38_references/radia_retrogenes.tar.gz pseudogene_beds: S3://protect-data/hg38_references/radia_pseudogenes.tar.gz gencode_beds: S3://protect-data/hg38_references/radia_gencode.tar.gz + # run: True # version: 398366ef07b5911d8082ed61cbf03d487a41f286 somaticsniper: + # run: True # version: 1.0.4 samtools: # version: 0.1.8 @@ -118,6 +122,7 @@ mutation_calling: #run_trinity: True #version: 1.0.1 strelka: + # run: True # version: 1.0.15 config_file: S3://protect-data/hg38_references/strelka_bwa_WXS_config.ini.tar.gz