-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Scheme creation/tree visualization #58
Changes from 58 commits
ae196e3
693a417
0970ccf
ac9a58d
24cabfe
a357f62
e7ff894
14d3fd5
a80499f
b24bf2e
842a648
e8a0f9e
5e61271
aa6b407
a3532da
578aa4c
0a048e0
1412a5e
dc1463b
6d0d7aa
ce09e25
c292223
d4cd0c9
46ee176
674c46a
0e59bdc
f81fe90
1cf1fa7
40b47cb
c5d5586
32aa088
10d4357
375f30e
83aa97c
aee9496
395792f
4d1d9c5
275637d
d4ff06c
e8ec3cf
b00437e
957fe54
a386dee
ce3335a
f68f6e9
b012ece
bad3a86
1a71e15
90a26af
e98d7b0
10c328c
dc053ca
a24132d
8758254
80e2bb2
507a105
e96e954
bfa947b
9b70aa9
0175d92
181b812
ad0706f
d141a97
b9b8857
46cbc1f
b554b06
b347944
292749a
784a18e
99966c0
b5b061a
94b2c02
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -145,3 +145,11 @@ match_results.tab | |
results.tab | ||
test.tab | ||
|
||
# VS Code | ||
.vscode | ||
|
||
# pytest_cache | ||
.pytest_cache/* | ||
.pytest_cache | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,16 @@ | ||
import logging | ||
import os | ||
from typing import Union, Optional, Tuple, List | ||
|
||
import attr | ||
import click | ||
import pandas as pd | ||
|
||
from biohansel.create.display_tree import display_tree | ||
from biohansel.create.find_cluster import find_clusters | ||
from biohansel.create.group_snvs import group_snvs | ||
from biohansel.create.read_vcf import read_vcf | ||
from biohansel.create.write_sequence import get_sequences, read_sequence_file, write_sequences | ||
from biohansel.subtype import subtype_contigs_samples, subtype_reads_samples, Subtype | ||
from biohansel.subtype.const import SUBTYPE_SUMMARY_COLS, JSON_EXT_TMPL | ||
from biohansel.subtype.metadata import read_metadata_table, merge_metadata_with_summary_results | ||
|
@@ -247,26 +253,56 @@ def parse_comma_delimited_floats(ctx: click.Context, param: click.Option, value: | |
|
||
|
||
@cli.command() | ||
@click.option('--vcf-file-path', | ||
@click.option('-v', '--vcf-file-path', | ||
required=True, | ||
type=click.Path(exists=True, dir_okay=False), | ||
help='Variant calling file (VCF) of a collection of input genomes for population of interest against a ' | ||
'reference genome that must be specified with --reference-genome-path') | ||
@click.option('--reference-genome-path', | ||
@click.option('-r', '--reference-genome-path', | ||
required=True, | ||
type=click.Path(exists=True, dir_okay=False), | ||
help='Reference genome assembly file path. The reference used in the creation of the input VCF file.') | ||
@click.option('--phylo-tree-path', | ||
required=False, | ||
type=click.Path(exists=True, dir_okay=False), | ||
help='Optional phylogenetic tree created from variant calling analysis.') | ||
@click.option('--distance-thresholds', | ||
@click.option('-d', '--distance-thresholds', | ||
required=False, | ||
type=str, | ||
callback=parse_comma_delimited_floats, | ||
help='Comma delimited list of distance thresholds for creating hierarchical clustering groups ' | ||
'(e.g. "0,0.05,0.1,0.15")') | ||
def create(vcf_file_path, reference_genome_path, phylo_tree_path, distance_thresholds): | ||
@click.option('-o', '--output-folder-name', | ||
required=True, | ||
type=click.Path(exists=False, dir_okay=True), | ||
help='Output folder name in which schema file would be located' | ||
) | ||
@click.option('-s', '--schema-name', | ||
required=False, | ||
type=str, | ||
help='A unique name for the schema file that is generated, the default is just' | ||
'{bio_hansel-schema-reference_genome_name}-{schema_version}') | ||
@click.option('-m', '--schema-version', | ||
required=False, | ||
type=str, | ||
help='An optional version number for the schema file that is generated') | ||
@click.option('-p', '--padding-sequence-length', | ||
required=True, | ||
type=int, | ||
help='Output folder name in which schema file would be located' | ||
) | ||
@click.option('-f', '--reference-genome-format', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's either try to handle these formats automatically (e.g. check file extensions) or just force users to supply a FASTA file instead of the 100+ possible different formats sequences can come in. Also use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes this has been addressed in commit 0175d92, I kept this variable since the genbank format definitely comes up alot as well. Also Biopython has a built in fasta parser, so that was the one I wanted to keep using. Let me know if that's ok There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, that's fine. However, please make this option optional (i.e. If a GenBank format file is supplied and it has annotations (e.g. Prokka output or RefSeq GenBank file), it may be useful to output what the markers target in a table (e.g. marker at position 123 in contig abc in genome X lies in CDS with gene name xyz and product "some protein"). This could be useful information for understanding the scheme output. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes this has been addressed in commit b554b06 where the user has the option to specify the format, but is not required to |
||
required=True, | ||
type=str, | ||
help='Reference genome file format, i.e. fasta, genbank' | ||
) | ||
@click.option('-t', '--min-group-size', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about Let's also set a default for this value to reduce the amount of things users need to specify via the command-line and that make sense like integers between 2 and 10 (or half of number of input samples) inclusive. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes this has been addressed in commit d141a97 |
||
required=True, | ||
type=int, | ||
help='The minimum child group size for each new subtype branching point from the parent group' | ||
) | ||
def create(vcf_file_path, reference_genome_path, phylo_tree_path, distance_thresholds, output_folder_name, schema_name, | ||
reference_genome_format, padding_sequence_length, min_group_size, schema_version): | ||
"""Create a biohansel subtyping scheme. | ||
|
||
From the results of a variant calling analysis, create a biohansel subtyping with single nucleotide variants (SNV) | ||
|
@@ -276,6 +312,37 @@ def create(vcf_file_path, reference_genome_path, phylo_tree_path, distance_thres | |
click.secho(f'Reference genome file path: {reference_genome_path}', fg='red') | ||
click.secho(f'Phylogenetic tree file path: {phylo_tree_path}', fg='yellow') | ||
click.secho(f'Distance thresholds: {distance_thresholds}', fg='blue') | ||
click.secho(f'Output folder name: {output_folder_name}', fg='magenta') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please remove or replace all There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes this has been fixed in 181b812 |
||
click.secho(f'Scheme name: {schema_name}', fg='cyan') | ||
click.secho(f'Reference genome format: {reference_genome_format}', fg='green') | ||
click.secho(f'Min group size: {min_group_size}', fg='yellow') | ||
click.secho(f'Padding Sequence length: {padding_sequence_length}', fg='blue') | ||
logging.info(f'Creating biohansel subtyping scheme from SNVs in "{vcf_file_path}" using reference genome ' | ||
f'"{reference_genome_path}" at {distance_thresholds if distance_thresholds else "all possible"} ' | ||
f'distance threshold levels.') | ||
reference_genome_name = os.path.split(reference_genome_path)[-1] | ||
reference_genome_name = reference_genome_name.split(".")[-2] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes this has been fixed in 181b812 |
||
|
||
if schema_version is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please set a default in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes this has been fixed in 181b812 |
||
schema_version = "0.1.0" | ||
|
||
if schema_name is not None: | ||
schema_name = f"{schema_name}-{schema_version}" | ||
else: | ||
schema_name = f"bio_hansel-schema-{reference_genome_name}-{schema_version}" | ||
|
||
if not os.path.exists(output_folder_name): | ||
os.makedirs(output_folder_name) | ||
|
||
sequence_df, binary_df = read_vcf(vcf_file_path) | ||
groups_dict = find_clusters(binary_df, min_group_size) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about outputting all the intermediate results to the output directory? Those files may be useful to the user so they can see how the clusters were determined. It would also be useful to create clusters at various group size thresholds. We could take a range of values from the command-line, e.g. . There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes this has been addressed in d141a97 |
||
if phylo_tree_path is not None: | ||
display_tree(phylo_tree_path, groups_dict) | ||
record_dict = read_sequence_file(reference_genome_path, reference_genome_format) | ||
results_dict = group_snvs(binary_df, sequence_df, groups_dict) | ||
for group, curr_df in results_dict.items(): | ||
df_list = get_sequences(curr_df, padding_sequence_length, | ||
record_dict) | ||
write_sequences(output_folder_name, df_list, schema_name, group) | ||
output_schema_path = os.path.join(output_folder_name, f"{schema_name}.fasta") | ||
logging.info(f"Finished writing schema file to {output_schema_path}") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import logging | ||
|
||
from ete3 import Tree | ||
|
||
from typing import Dict | ||
|
||
|
||
def display_tree(phylo_tree_path: str, groups_dict: Dict[str, str]) -> str: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It doesn't look like this function "displays" a tree, but instead reads the contents of a (I'm guessing) Newick format file and dangerously replaces genome names with some new name, then parses the string into a ete3 Doing this is dangerous because you could be replacing more than the expected genome name for genome, group in groups_dict.items():
new_name = f"{genome}-{group}"
new_tree = new_tree.replace(genome, new_name) e.g. what if a genome is named You're already using The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes this has been addressed in commit b9b8857 |
||
""" Displays the modified phylogenetic tree with subclades added to the original genome names | ||
i.e. SRR238289-1.1.2 | ||
Args: | ||
phylo_tree_path: the path to the user-defined phylogenetic tree | ||
groups_dict: the dictionary that contains the group information for each genome | ||
|
||
Returns: | ||
new_tree: the modified tree file to that displays the phylogenetic tree to the user | ||
""" | ||
|
||
with open(phylo_tree_path) as file: | ||
new_tree = file.read() | ||
for genome, group in groups_dict.items(): | ||
new_name = f"{genome}-{group}" | ||
new_tree = new_tree.replace(genome, new_name) | ||
tree_diagram = Tree(new_tree) | ||
logging.info(f"{tree_diagram}\n") | ||
|
||
return new_tree |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,237 @@ | ||
from typing import Dict, Set, List | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import scipy as sp | ||
|
||
|
||
from collections import defaultdict | ||
from scipy.cluster.hierarchy import fcluster, linkage | ||
from scipy.spatial.distance import pdist | ||
|
||
|
||
def find_clusters(df: pd.DataFrame, min_group_size: int) -> Dict[str, int]: | ||
""" | ||
Takes in a vcf file and creates clusters from the scipy hierarchy clustering algorithm | ||
|
||
Example: | ||
'/path/example.vcf' -> {'mysnpsSRR2323':'1', 'mysnpsSRR232323':'2'} | ||
|
||
Args: | ||
df: contains the vcf information in the DataFrame format | ||
min_group_size: the minimum child group size for each new subtype branching point from the parent group | ||
|
||
Returns: | ||
cluster_dict: a dictionary indicating the cluster membership of each of the supplied genomes in | ||
the vcf file | ||
""" | ||
|
||
distance_matrix = compute_distance_matrix(df) | ||
clustering_array = create_linkage_array(distance_matrix) | ||
|
||
flat_clusters = output_flat_clusters(clustering_array, df.columns, distance_matrix, min_group_size) | ||
|
||
return flat_clusters | ||
|
||
|
||
def compute_distance_matrix(df: pd.DataFrame) -> np.ndarray: | ||
"""Takes in a binary SNV state DataFrame and outputs a distance matrix using the hamming method | ||
|
||
Args: | ||
df: the DataFrame that contains only the samples' binary data | ||
|
||
Returns: | ||
a matrix of pair-wise distances between samples | ||
""" | ||
|
||
return sp.spatial.distance.pdist( | ||
df.transpose(), metric='hamming') | ||
|
||
|
||
def create_linkage_array(distance_matrix: np.ndarray) -> np.ndarray: | ||
"""Takes in a distance matrix and outputs a hierarchical clustering linkage array | ||
|
||
Args: | ||
distance_matrix: a matrix of pair-wise distances between samples | ||
|
||
Returns: | ||
clustering_array: a hierarchical clustering linkage array that is calculated from the distance matrix | ||
""" | ||
|
||
clustering_array = linkage(distance_matrix, method='complete') | ||
|
||
return clustering_array | ||
|
||
|
||
def output_flat_clusters(clustering_array: np.ndarray, genomes_only: List, distance_matrix: np.ndarray, | ||
min_group_size: int) -> \ | ||
pd.DataFrame: | ||
"""Uses a set of thresholds to output a flat cluster of the linkage array | ||
|
||
Args: | ||
clustering_array: a hierarchical clustering linkage array that is calculated from the distance matrix | ||
genomes_only: an array of column names for the original SNV DataFrame | ||
distance_matrix: a matrix of pair-wise distances between samples | ||
min_group_size: the minimum child group size for each new subtype branching point from the parent group | ||
|
||
Returns: | ||
flat_clusters: an array of flat clusters from the clustering array | ||
""" | ||
clusters = np.array([ | ||
fcluster(clustering_array, t=distance, criterion='distance') | ||
for distance in np.sort(np.unique(distance_matrix))]) | ||
cluster_df = pd.DataFrame(np.array(clusters), index=np.sort(np.unique(distance_matrix))) | ||
matrix = cluster_df.transpose() | ||
matrix.index = genomes_only | ||
matrix = matrix.sort_values( | ||
by=[cluster_grouping for cluster_grouping in matrix.columns.sort_values(ascending=False)]) | ||
thresholds = [threshold for threshold in matrix.columns.sort_values(ascending=False)] | ||
|
||
cluster_dict = defaultdict(list) | ||
|
||
for threshold in thresholds: | ||
grouping = '-'.join([str(cluster) for cluster in matrix[threshold]]) | ||
cluster_dict[grouping].append(threshold) | ||
|
||
cluster_matrix = matrix[[thresholds[0] for thresholds in cluster_dict.values()]] | ||
cluster_matrix.columns = [index for index, thresholds in enumerate(cluster_matrix.columns)] | ||
|
||
cluster_matrix = cluster_matrix.drop([0], axis=1) | ||
|
||
clusters_genomes_dict = cluster_df_to_dict(cluster_matrix) | ||
|
||
hierarchical_cluster_df = pd.DataFrame( | ||
expand_sets( | ||
assign_hc_clusters(clusters_genomes_dict=clusters_genomes_dict, min_group_size=min_group_size))).fillna( | ||
'').loc[cluster_matrix.index, :] | ||
final_assigned_clusters = df_to_subtypes_dict(hierarchical_cluster_df) | ||
|
||
return final_assigned_clusters | ||
|
||
|
||
def cluster_df_to_dict(df_clusters: pd.DataFrame) -> Dict[float, Dict[int, Set[str]]]: | ||
"""Clusters dataframe to dict of threshold levels to clusters to members | ||
|
||
Args: | ||
df_clusters: The cluster dataframe with original cluster groupings | ||
|
||
Returns: | ||
clusters_genomes_dict: A dictionary of the genome cluster groupings with the thresholds used as the key | ||
""" | ||
clusters_genomes_dict = {} | ||
for threshold in df_clusters.columns: | ||
clusters = df_clusters[threshold] # type: pd.Series | ||
cluster_genomes = defaultdict(set) | ||
for genome, cluster in clusters.iteritems(): | ||
cluster_genomes[cluster].add(genome) | ||
clusters_genomes_dict[threshold] = cluster_genomes | ||
|
||
return clusters_genomes_dict | ||
|
||
|
||
def assign_hc_clusters(clusters_genomes_dict: Dict[float, Dict[int, Set[str]]], min_group_size: int) -> Dict[ | ||
str, Dict[str, Set[str]]]: | ||
""" | ||
Assigns the subtypes for each genome at each threshold level | ||
|
||
Args: | ||
clusters_genomes_dict: A dictionary of the genome cluster groupings with the thresholds used as the key | ||
min_group_size: the minimum child group size for each new subtype branching point from the parent group | ||
|
||
Returns: | ||
output_clusters: A dictionary within a dictionary that contains each subgtype and the sets of genomes within | ||
that subtype | ||
""" | ||
|
||
output_subtypes = {threshold: {} for threshold in clusters_genomes_dict.keys()} | ||
sorted_thresholds = sorted(clusters_genomes_dict.keys()) | ||
# initialize top level subtypes | ||
output_subtypes[sorted_thresholds[0]] = {str(cluster): genomes for cluster, genomes in | ||
clusters_genomes_dict[sorted_thresholds[0]].items()} | ||
|
||
for threshold_index in range(1, len(sorted_thresholds)): | ||
parent_hc_clusters = output_subtypes[sorted_thresholds[threshold_index - 1]] | ||
|
||
threshold = sorted_thresholds[threshold_index] | ||
cluster_genomes = clusters_genomes_dict[threshold] | ||
|
||
for subtype, parent_genomes in parent_hc_clusters.items(): | ||
if len(parent_genomes) < min_group_size: | ||
continue | ||
subclade = 1 | ||
for _, child_genomes in cluster_genomes.items(): | ||
if len(child_genomes) < min_group_size: | ||
continue | ||
|
||
if parent_genomes == child_genomes: | ||
output_subtypes[threshold][subtype] = parent_genomes | ||
elif child_genomes.issubset(parent_genomes): | ||
new_subgroup_name = f'{subtype}.{subclade}' | ||
if len(parent_genomes - child_genomes) >= min_group_size: | ||
output_subtypes[threshold][new_subgroup_name] = child_genomes | ||
else: | ||
output_subtypes[threshold][subtype] = child_genomes | ||
subclade += 1 | ||
|
||
return output_subtypes | ||
|
||
|
||
def expand_sets(cluster_dict: Dict[str, Dict[str, Set[str]]]) -> Dict[str, Dict[str, str]]: | ||
""" | ||
Fills in the rest of the cells in the dataframe that had previously been unfilled | ||
|
||
Args: | ||
cluster_dict: a dictionary that contains the cluster groupings of each genome with each value being sets of | ||
genomes | ||
|
||
Returns: | ||
modified_cluster_dict: A dictionary with values beings filled for all cells | ||
|
||
""" | ||
|
||
modified_cluster_dict = {} | ||
for threshold, groupings in cluster_dict.items(): | ||
threshold_dict = {} | ||
for grouping, genomes in groupings.items(): | ||
for genome in genomes: | ||
threshold_dict[genome] = grouping | ||
modified_cluster_dict[threshold] = threshold_dict | ||
|
||
return modified_cluster_dict | ||
|
||
|
||
def row_subtype(curr_genome: pd.Series) -> str: | ||
""" | ||
Provides the last valid subtype for that particular genome, as that's the last valid branch point | ||
|
||
Args: | ||
curr_genome: the current subtypes for that particular genome | ||
|
||
Returns: | ||
row_unique[-1]: the last valid subtype name for that genome | ||
|
||
""" | ||
|
||
row_unique = curr_genome.unique() | ||
row_unique = row_unique[(row_unique != '') & (~pd.isnull(row_unique))] | ||
|
||
return row_unique[-1] | ||
|
||
|
||
def df_to_subtypes_dict(cluster_genomes_df: pd.DataFrame) -> Dict[str, str]: | ||
""" | ||
Provides a dictionary of the subtypes with the genome being the key and the subtype being the value | ||
|
||
Args: | ||
cluster_genomes_df: the dataframe containing the genomes and their respective subtypes at each threshold | ||
|
||
Returns: | ||
final_cluster_dict: the dictionary that contains the final subtype assignments for each genome | ||
|
||
""" | ||
|
||
final_cluster_dict = {} | ||
for genome, row in cluster_genomes_df.apply(row_subtype, axis=1).iteritems(): | ||
final_cluster_dict[genome] = row | ||
|
||
return final_cluster_dict |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This help information does not seem to correspond to this option. Isn't
padding_sequence_length
the length of sequence to extract around each SNP?Also can you change this to
--tile-length
? Wheremath.ceil((tile_length -1) / 2)
would be the length of sequence to get around each SNV with a warning to the user if they provided an even integer value that instead of tiles being lengthn
, they will be lengthn+1
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes this has been addressed in commit 0175d92