phac-nml · gcttong · May 31, 2018 · Jun 6, 2018 · Jun 6, 2018 · Jun 12, 2018
diff --git a/.gitignore b/.gitignore
@@ -145,3 +145,11 @@ match_results.tab
 results.tab
 test.tab
 
+# VS Code
+.vscode
+
+# pytest_cache
+.pytest_cache/*
+.pytest_cache
+
+
diff --git a/biohansel/cli.py b/biohansel/cli.py
@@ -1,10 +1,16 @@
 import logging
+import os
 from typing import Union, Optional, Tuple, List
 
 import attr
 import click
 import pandas as pd
 
+from biohansel.create.display_tree import display_tree
+from biohansel.create.find_cluster import find_clusters
+from biohansel.create.group_snvs import group_snvs
+from biohansel.create.read_vcf import read_vcf
+from biohansel.create.write_sequence import get_sequences, read_sequence_file, write_sequences
 from biohansel.subtype import subtype_contigs_samples, subtype_reads_samples, Subtype
 from biohansel.subtype.const import SUBTYPE_SUMMARY_COLS, JSON_EXT_TMPL
 from biohansel.subtype.metadata import read_metadata_table, merge_metadata_with_summary_results
@@ -247,26 +253,56 @@ def parse_comma_delimited_floats(ctx: click.Context, param: click.Option, value:
 
 
 @cli.command()
-@click.option('--vcf-file-path',
+@click.option('-v', '--vcf-file-path',
               required=True,
               type=click.Path(exists=True, dir_okay=False),
               help='Variant calling file (VCF) of a collection of input genomes for population of interest against a '
                    'reference genome that must be specified with --reference-genome-path')
-@click.option('--reference-genome-path',
+@click.option('-r', '--reference-genome-path',
               required=True,
               type=click.Path(exists=True, dir_okay=False),
               help='Reference genome assembly file path. The reference used in the creation of the input VCF file.')
 @click.option('--phylo-tree-path',
               required=False,
               type=click.Path(exists=True, dir_okay=False),
               help='Optional phylogenetic tree created from variant calling analysis.')
-@click.option('--distance-thresholds',
+@click.option('-d', '--distance-thresholds',
               required=False,
               type=str,
               callback=parse_comma_delimited_floats,
               help='Comma delimited list of distance thresholds for creating hierarchical clustering groups '
                    '(e.g. "0,0.05,0.1,0.15")')
-def create(vcf_file_path, reference_genome_path, phylo_tree_path, distance_thresholds):
+@click.option('-o', '--output-folder-name',
+              required=True,
+              type=click.Path(exists=False, dir_okay=True),
+              help='Output folder name in which schema file would be located'
+              )
+@click.option('-s', '--schema-name',
+              required=False,
+              type=str,
+              help='A unique name for the schema file that is generated, the default is just'
+                   '{bio_hansel-schema-reference_genome_name}-{schema_version}')
+@click.option('-m', '--schema-version',
+              required=False,
+              type=str,
+              help='An optional version number for the schema file that is generated')
+@click.option('-p', '--padding-sequence-length',
+              required=True,
+              type=int,
+              help='Output folder name in which schema file would be located'
+              )
+@click.option('-f', '--reference-genome-format',
+              required=True,
+              type=str,
+              help='Reference genome file format, i.e. fasta, genbank'
+              )
+@click.option('-t', '--min-group-size',
+              required=True,
+              type=int,
+              help='The minimum child group size for each new subtype branching point from the parent group'
+              )
+def create(vcf_file_path, reference_genome_path, phylo_tree_path, distance_thresholds, output_folder_name, schema_name,
+           reference_genome_format, padding_sequence_length, min_group_size, schema_version):
     """Create a biohansel subtyping scheme.
 
     From the results of a variant calling analysis, create a biohansel subtyping with single nucleotide variants (SNV)
@@ -276,6 +312,37 @@ def create(vcf_file_path, reference_genome_path, phylo_tree_path, distance_thres
     click.secho(f'Reference genome file path: {reference_genome_path}', fg='red')
     click.secho(f'Phylogenetic tree file path: {phylo_tree_path}', fg='yellow')
     click.secho(f'Distance thresholds: {distance_thresholds}', fg='blue')
+    click.secho(f'Output folder name: {output_folder_name}', fg='magenta')
+    click.secho(f'Scheme name: {schema_name}', fg='cyan')
+    click.secho(f'Reference genome format: {reference_genome_format}', fg='green')
+    click.secho(f'Min group size: {min_group_size}', fg='yellow')
+    click.secho(f'Padding Sequence length: {padding_sequence_length}', fg='blue')
     logging.info(f'Creating biohansel subtyping scheme from SNVs in "{vcf_file_path}" using reference genome '
                  f'"{reference_genome_path}" at {distance_thresholds if distance_thresholds else "all possible"} '
                  f'distance threshold levels.')
+    reference_genome_name = os.path.split(reference_genome_path)[-1]
+    reference_genome_name = reference_genome_name.split(".")[-2]
+
+    if schema_version is None:
+        schema_version = "0.1.0"
+
+    if schema_name is not None:
+        schema_name = f"{schema_name}-{schema_version}"
+    else:
+        schema_name = f"bio_hansel-schema-{reference_genome_name}-{schema_version}"
+
+    if not os.path.exists(output_folder_name):
+        os.makedirs(output_folder_name)
+
+    sequence_df, binary_df = read_vcf(vcf_file_path)
+    groups_dict = find_clusters(binary_df, min_group_size)
+    if phylo_tree_path is not None:
+        display_tree(phylo_tree_path, groups_dict)
+    record_dict = read_sequence_file(reference_genome_path, reference_genome_format)
+    results_dict = group_snvs(binary_df, sequence_df, groups_dict)
+    for group, curr_df in results_dict.items():
+        df_list = get_sequences(curr_df, padding_sequence_length,
+                                record_dict)
+        write_sequences(output_folder_name, df_list, schema_name, group)
+    output_schema_path = os.path.join(output_folder_name, f"{schema_name}.fasta")
+    logging.info(f"Finished writing schema file to {output_schema_path}")
diff --git a/biohansel/create/display_tree.py b/biohansel/create/display_tree.py
@@ -0,0 +1,27 @@
+import logging
+
+from ete3 import Tree
+
+from typing import Dict
+
+
+def display_tree(phylo_tree_path: str, groups_dict: Dict[str, str]) -> str:
+    """ Displays the modified phylogenetic tree with subclades added to the original genome names
+    i.e. SRR238289-1.1.2
+    Args:
+        phylo_tree_path: the path to the user-defined phylogenetic tree
+        groups_dict: the dictionary that contains the group information for each genome
+
+    Returns:
+        new_tree: the modified tree file to that displays the phylogenetic tree to the user
+    """
+
+    with open(phylo_tree_path) as file:
+        new_tree = file.read()
+        for genome, group in groups_dict.items():
+            new_name = f"{genome}-{group}"
+            new_tree = new_tree.replace(genome, new_name)
+        tree_diagram = Tree(new_tree)
+        logging.info(f"{tree_diagram}\n")
+
+    return new_tree
diff --git a/biohansel/create/find_cluster.py b/biohansel/create/find_cluster.py
@@ -0,0 +1,237 @@
+from typing import Dict, Set, List
+
+import numpy as np
+import pandas as pd
+import scipy as sp
+
+
+from collections import defaultdict
+from scipy.cluster.hierarchy import fcluster, linkage
+from scipy.spatial.distance import pdist
+
+
+def find_clusters(df: pd.DataFrame, min_group_size: int) -> Dict[str, int]:
+    """
+    Takes in a vcf file and creates clusters from the scipy hierarchy clustering algorithm
+
+    Example:
+    '/path/example.vcf' -> {'mysnpsSRR2323':'1', 'mysnpsSRR232323':'2'}
+
+    Args:
+        df: contains the vcf information in the DataFrame format
+        min_group_size: the minimum child group size for each new subtype branching point from the parent group
+
+    Returns:
+        cluster_dict: a dictionary indicating the cluster membership of each of the supplied genomes in
+                the vcf file
+    """
+
+    distance_matrix = compute_distance_matrix(df)
+    clustering_array = create_linkage_array(distance_matrix)
+
+    flat_clusters = output_flat_clusters(clustering_array, df.columns, distance_matrix, min_group_size)
+
+    return flat_clusters
+
+
+def compute_distance_matrix(df: pd.DataFrame) -> np.ndarray:
+    """Takes in a binary SNV state DataFrame and outputs a distance matrix using the hamming method
+
+    Args:
+        df: the DataFrame that contains only the samples' binary data
+
+    Returns:
+        a matrix of pair-wise distances between samples
+    """
+
+    return sp.spatial.distance.pdist(
+        df.transpose(), metric='hamming')
+
+
+def create_linkage_array(distance_matrix: np.ndarray) -> np.ndarray:
+    """Takes in a distance matrix and outputs a hierarchical clustering linkage array
+
+    Args:
+        distance_matrix: a matrix of pair-wise distances between samples
+
+    Returns:
+        clustering_array: a hierarchical clustering linkage array that is calculated from the distance matrix
+    """
+
+    clustering_array = linkage(distance_matrix, method='complete')
+
+    return clustering_array
+
+
+def output_flat_clusters(clustering_array: np.ndarray, genomes_only: List, distance_matrix: np.ndarray,
+                         min_group_size: int) -> \
+        pd.DataFrame:
+    """Uses a set of thresholds to output a flat cluster of the linkage array
+
+    Args:
+        clustering_array: a hierarchical clustering linkage array that is calculated from the distance matrix
+        genomes_only: an array of column names for the original SNV DataFrame
+        distance_matrix: a matrix of pair-wise distances between samples
+        min_group_size: the minimum child group size for each new subtype branching point from the parent group
+
+    Returns:
+         flat_clusters: an array of flat clusters from the clustering array
+    """
+    clusters = np.array([
+        fcluster(clustering_array, t=distance, criterion='distance')
+        for distance in np.sort(np.unique(distance_matrix))])
+    cluster_df = pd.DataFrame(np.array(clusters), index=np.sort(np.unique(distance_matrix)))
+    matrix = cluster_df.transpose()
+    matrix.index = genomes_only
+    matrix = matrix.sort_values(
+        by=[cluster_grouping for cluster_grouping in matrix.columns.sort_values(ascending=False)])
+    thresholds = [threshold for threshold in matrix.columns.sort_values(ascending=False)]
+
+    cluster_dict = defaultdict(list)
+
+    for threshold in thresholds:
+        grouping = '-'.join([str(cluster) for cluster in matrix[threshold]])
+        cluster_dict[grouping].append(threshold)
+
+    cluster_matrix = matrix[[thresholds[0] for thresholds in cluster_dict.values()]]
+    cluster_matrix.columns = [index for index, thresholds in enumerate(cluster_matrix.columns)]
+
+    cluster_matrix = cluster_matrix.drop([0], axis=1)
+
+    clusters_genomes_dict = cluster_df_to_dict(cluster_matrix)
+
+    hierarchical_cluster_df = pd.DataFrame(
+        expand_sets(
+            assign_hc_clusters(clusters_genomes_dict=clusters_genomes_dict, min_group_size=min_group_size))).fillna(
+        '').loc[cluster_matrix.index, :]
+    final_assigned_clusters = df_to_subtypes_dict(hierarchical_cluster_df)
+
+    return final_assigned_clusters
+
+
+def cluster_df_to_dict(df_clusters: pd.DataFrame) -> Dict[float, Dict[int, Set[str]]]:
+    """Clusters dataframe to dict of threshold levels to clusters to members
+
+    Args:
+        df_clusters: The cluster dataframe with original cluster groupings
+
+    Returns:
+        clusters_genomes_dict: A dictionary of the genome cluster groupings with the thresholds used as the key
+    """
+    clusters_genomes_dict = {}
+    for threshold in df_clusters.columns:
+        clusters = df_clusters[threshold]  # type: pd.Series
+        cluster_genomes = defaultdict(set)
+        for genome, cluster in clusters.iteritems():
+            cluster_genomes[cluster].add(genome)
+        clusters_genomes_dict[threshold] = cluster_genomes
+
+    return clusters_genomes_dict
+
+
+def assign_hc_clusters(clusters_genomes_dict: Dict[float, Dict[int, Set[str]]], min_group_size: int) -> Dict[
+    str, Dict[str, Set[str]]]:
+    """
+    Assigns the subtypes for each genome at each threshold level
+
+    Args:
+        clusters_genomes_dict: A dictionary of the genome cluster groupings with the thresholds used as the key
+        min_group_size: the minimum child group size for each new subtype branching point from the parent group
+
+    Returns:
+        output_clusters: A dictionary within a dictionary that contains each subgtype and the sets of genomes within
+                        that subtype
+    """
+
+    output_subtypes = {threshold: {} for threshold in clusters_genomes_dict.keys()}
+    sorted_thresholds = sorted(clusters_genomes_dict.keys())
+    # initialize top level subtypes
+    output_subtypes[sorted_thresholds[0]] = {str(cluster): genomes for cluster, genomes in
+                                             clusters_genomes_dict[sorted_thresholds[0]].items()}
+
+    for threshold_index in range(1, len(sorted_thresholds)):
+        parent_hc_clusters = output_subtypes[sorted_thresholds[threshold_index - 1]]
+
+        threshold = sorted_thresholds[threshold_index]
+        cluster_genomes = clusters_genomes_dict[threshold]
+
+        for subtype, parent_genomes in parent_hc_clusters.items():
+            if len(parent_genomes) < min_group_size:
+                continue
+            subclade = 1
+            for _, child_genomes in cluster_genomes.items():
+                if len(child_genomes) < min_group_size:
+                    continue
+
+                if parent_genomes == child_genomes:
+                    output_subtypes[threshold][subtype] = parent_genomes
+                elif child_genomes.issubset(parent_genomes):
+                    new_subgroup_name = f'{subtype}.{subclade}'
+                    if len(parent_genomes - child_genomes) >= min_group_size:
+                        output_subtypes[threshold][new_subgroup_name] = child_genomes
+                    else:
+                        output_subtypes[threshold][subtype] = child_genomes
+                    subclade += 1
+
+    return output_subtypes
+
+
+def expand_sets(cluster_dict: Dict[str, Dict[str, Set[str]]]) -> Dict[str, Dict[str, str]]:
+    """
+    Fills in the rest of the cells in the dataframe that had previously been unfilled
+
+    Args:
+        cluster_dict: a dictionary that contains the cluster groupings of each genome with each value being sets of
+                    genomes
+
+    Returns:
+        modified_cluster_dict: A dictionary with values beings filled for all cells
+
+    """
+
+    modified_cluster_dict = {}
+    for threshold, groupings in cluster_dict.items():
+        threshold_dict = {}
+        for grouping, genomes in groupings.items():
+            for genome in genomes:
+                threshold_dict[genome] = grouping
+        modified_cluster_dict[threshold] = threshold_dict
+
+    return modified_cluster_dict
+
+
+def row_subtype(curr_genome: pd.Series) -> str:
+    """
+    Provides the last valid subtype for that particular genome, as that's the last valid branch point
+
+    Args:
+        curr_genome: the current subtypes for that particular genome
+
+    Returns:
+        row_unique[-1]: the last valid subtype name for that genome
+
+    """
+
+    row_unique = curr_genome.unique()
+    row_unique = row_unique[(row_unique != '') & (~pd.isnull(row_unique))]
+
+    return row_unique[-1]
+
+
+def df_to_subtypes_dict(cluster_genomes_df: pd.DataFrame) -> Dict[str, str]:
+    """
+    Provides a dictionary of the subtypes with the genome being the key and the subtype being the value
+
+    Args:
+        cluster_genomes_df: the dataframe containing the genomes and their respective subtypes at each threshold
+
+    Returns:
+        final_cluster_dict: the dictionary that contains the final subtype assignments for each genome
+
+    """
+
+    final_cluster_dict = {}
+    for genome, row in cluster_genomes_df.apply(row_subtype, axis=1).iteritems():
+        final_cluster_dict[genome] = row
+
+    return final_cluster_dict