Skip to content

Commit

Permalink
Merge pull request #120 from phac-nml/fix/na-subtype
Browse files Browse the repository at this point in the history
Fix metadata NA value issue
  • Loading branch information
DarianHole authored Oct 2, 2019
2 parents c4d07c7 + 84fe631 commit 22c9a04
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 43 deletions.
53 changes: 23 additions & 30 deletions bio_hansel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,9 @@
from .subtyper import \
subtype_contigs_samples, \
subtype_reads_samples
from .metadata import read_metadata_table, merge_metadata_with_summary_results
from .utils import \
genome_name_from_fasta_path, \
get_scheme_fasta, \
does_file_exist, \
collect_fastq_from_dir, \
group_fastqs, \
collect_fasta_from_dir, \
init_subtyping_params
from .metadata import read_metadata_table, merge_results_with_metadata
from .utils import (genome_name_from_fasta_path, get_scheme_fasta, does_file_exist, collect_fastq_from_dir,
group_fastqs, collect_fasta_from_dir, init_subtyping_params, df_field_fillna)

SCRIPT_NAME = 'hansel'
LOG_FORMAT = '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
Expand Down Expand Up @@ -70,7 +64,7 @@ def init_parser():
help='input fasta file path AND genome name')
parser.add_argument('-D', '--input-directory',
help='directory of input fasta files (.fasta|.fa|.fna) or FASTQ files (paired FASTQ should '
'have same basename with "_\d\.(fastq|fq)" postfix to be automatically paired) '
'have same basename with "_\\d\\.(fastq|fq)" postfix to be automatically paired) '
'(files can be Gzipped)')
parser.add_argument('-o', '--output-summary',
help='Subtyping summary output path (tab-delimited)')
Expand Down Expand Up @@ -126,7 +120,7 @@ def collect_inputs(args: Any) -> Tuple[List[Tuple[str, str]], List[Tuple[List[st
"""Collect all input files for analysis
Sample names are derived from the base filename with no extensions.
Sequencing reads are paired if they share a common filename name without "_\d".
    Sequencing reads are paired if they share a common file name without "_\\d".
Filepaths for contigs and reads files are collected from an input directory if provided.
Args:
Expand Down Expand Up @@ -172,7 +166,7 @@ def collect_inputs(args: Any) -> Tuple[List[Tuple[str, str]], List[Tuple[List[st
continue
filenames = [os.path.basename(y) for y in x]
common_prefix = os.path.commonprefix(filenames)
genome_name = re.sub(r'[\W\_]+$', r'', common_prefix)
genome_name = re.sub(r'[\W_]+$', r'', common_prefix)
if genome_name == '':
genome_name = filenames[0]
reads.append((x, genome_name))
Expand All @@ -192,32 +186,31 @@ def main():
does_file_exist(output_simple_summary_path, args.force)
does_file_exist(output_summary_path, args.force)
does_file_exist(output_kmer_results, args.force)
scheme = args.scheme # type: str
scheme_name = args.scheme_name # type: Optional[str]
scheme: str = args.scheme
scheme_name: Optional[str] = args.scheme_name
scheme_fasta = get_scheme_fasta(scheme)
scheme_subtype_counts = subtype_counts(scheme_fasta)
directory_path = args.input_directory
logging.debug(args)
subtyping_params = init_subtyping_params(args, scheme)
input_contigs, input_reads = collect_inputs(args)
if len(input_contigs) == 0 and len(input_reads) == 0:
raise Exception('No input files specified!')

df_md = None
try:
df_md = read_metadata_table(resource_filename(program_name, f'data/{scheme}/metadata.tsv'))
except Exception:
pass

md_path = resource_filename(program_name, f'data/{scheme}/metadata.tsv')
if os.path.exists(md_path):
df_md = read_metadata_table(md_path)

if args.scheme_metadata:
if df_md is None:
df_md = pd. DataFrame()
df_md = pd.concat([df_md, read_metadata_table(args.scheme_metadata)], axis=1)
df_md = df_md.loc[:, ~df_md.columns.duplicated()]
df_md = read_metadata_table(args.scheme_metadata)
else:
df_md = pd.concat([df_md, read_metadata_table(args.scheme_metadata)], axis=1)
df_md = df_md.loc[:, ~df_md.columns.duplicated()]

n_threads = args.threads

subtype_results: List[Tuple[Subtype, pd.DataFrame]] = [] # type: List[Tuple[Subtype, pd.DataFrame]]
subtype_results: List[Tuple[Subtype, pd.DataFrame]] = []
if len(input_contigs) > 0:
contigs_results = subtype_contigs_samples(input_genomes=input_contigs,
scheme=scheme,
Expand All @@ -237,18 +230,18 @@ def main():
logging.info('Generated %s subtyping results from %s contigs samples', len(reads_results), len(input_reads))
subtype_results += reads_results

dfs: List[pd.DataFrame] = [df for st, df in subtype_results] # type: List[pd.DataFrame]
dfs: List[pd.DataFrame] = [df for st, df in subtype_results]
dfsummary = pd.DataFrame([attr.asdict(st) for st, df in subtype_results])

dfsummary = dfsummary[SUBTYPE_SUMMARY_COLS]

if dfsummary['avg_kmer_coverage'].isnull().all():
dfsummary = dfsummary.drop(labels='avg_kmer_coverage', axis=1)

dfsummary['subtype'].fillna(value='#N/A', inplace=True)
dfsummary = df_field_fillna(dfsummary)

if df_md is not None:
dfsummary = merge_metadata_with_summary_results(dfsummary, df_md)
dfsummary = merge_results_with_metadata(dfsummary, df_md)

kwargs_for_pd_to_table = dict(sep='\t', index=None, float_format='%.3f')
kwargs_for_pd_to_json = dict(orient='records')
Expand All @@ -264,8 +257,8 @@ def main():

if output_kmer_results:
if len(dfs) > 0:
dfall: pd.DataFrame = pd.concat([df.sort_values('is_pos_kmer', ascending=False) for df in dfs], sort=False) # type: pd.DataFrame
dfall['subtype'].fillna(value='#N/A', inplace=True)
dfall: pd.DataFrame = pd.concat([df.sort_values('is_pos_kmer', ascending=False) for df in dfs], sort=False)
dfall = df_field_fillna(dfall)
dfall.to_csv(output_kmer_results, **kwargs_for_pd_to_table)
logging.info('Kmer results written to "{}".'.format(output_kmer_results))
if args.json:
Expand All @@ -283,7 +276,7 @@ def main():
df_simple_summary = dfsummary[['sample', 'subtype', 'qc_status', 'qc_message']]

if df_md is not None:
df_simple_summary = merge_metadata_with_summary_results(df_simple_summary, df_md)
df_simple_summary = merge_results_with_metadata(df_simple_summary, df_md)

df_simple_summary.to_csv(output_simple_summary_path, **kwargs_for_pd_to_table)
if args.json:
Expand Down
9 changes: 4 additions & 5 deletions bio_hansel/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,16 @@ def read_metadata_table(path: str) -> Optional[pd.DataFrame]:
list(FILE_EXT_TO_PD_READ_FUNC.keys())
))
return None
dfmd = FILE_EXT_TO_PD_READ_FUNC[file_ext](path) # type: pd.DataFrame
dfmd: pd.DataFrame = FILE_EXT_TO_PD_READ_FUNC[file_ext](path)
assert np.any(dfmd.columns == 'subtype'), 'Column with name "subtype" expected in metadata file "{}"'.format(path)
dfmd.subtype.fillna('#N/A', inplace=True)
dfmd.subtype = dfmd.subtype.astype(str)
logging.info('Read scheme metadata file "{}" into DataFrame with shape {}'.format(path, dfmd.shape))
return dfmd


def merge_metadata_with_summary_results(df_results: pd.DataFrame, df_metadata: pd.DataFrame) -> pd.DataFrame:
"""Merge subtype metadata table into subtype results table.
def merge_results_with_metadata(df_results: pd.DataFrame, df_metadata: pd.DataFrame) -> pd.DataFrame:
"""Merge subtype results table with metadata table.
Args:
df_results: Subtyping results table.
Expand All @@ -53,6 +54,4 @@ def merge_metadata_with_summary_results(df_results: pd.DataFrame, df_metadata: p
Returns:
Subtyping results with subtype metadata merged in if metadata is present for subtype results.
"""
df_results.subtype = df_results.subtype.fillna('')
df_results.subtype = df_results.subtype.astype(str)
return pd.merge(df_results, df_metadata, how='left', on='subtype')
2 changes: 1 addition & 1 deletion bio_hansel/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def parse_fastq(filepath):
with os.popen('zcat < {}'.format(filepath)) as f:
yield from _parse_fastq(f)
else:
with open(filepath, 'rU') as f:
with open(filepath, 'r') as f:
yield from _parse_fastq(f)


Expand Down
9 changes: 9 additions & 0 deletions bio_hansel/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import re
from collections import defaultdict

import pandas as pd

from .const import SCHEME_FASTAS, REGEX_FASTQ, REGEX_FASTA
from .subtyping_params import SubtypingParams

Expand Down Expand Up @@ -204,3 +206,10 @@ def init_subtyping_params(args: Optional[Any] = None,
subtyping_params.max_degenerate_kmers = args.max_degenerate_kmers

return subtyping_params


def df_field_fillna(df: pd.DataFrame, field: str = 'subtype', na: str = '#N/A') -> pd.DataFrame:
    """Normalize missing values in a DataFrame column to a sentinel string.

    Empty strings and NaN/None in `df[field]` are replaced with `na`, and the
    column is cast to `str` so downstream merges/serialization see a uniform
    string dtype.

    Note: mutates `df` (column is reassigned in place) and also returns it for
    call-chaining convenience.

    Args:
        df: DataFrame to normalize.
        field: Name of the column to clean (default 'subtype').
        na: Sentinel written in place of empty/missing values (default '#N/A').

    Returns:
        The same DataFrame with `df[field]` cleaned.
    """
    # Avoid `inplace=True` on a column selection: that is chained assignment,
    # which is unreliable (SettingWithCopyWarning) and deprecated in pandas 2.x.
    # A single non-inplace chain assigned back to the column is always safe.
    df[field] = df[field].replace('', na).fillna(value=na).astype(str)
    return df
6 changes: 6 additions & 0 deletions tests/data/subtype-metadata-with-na.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
subtype a b c
1 a1 b1 c1
1.1 a1.1 b1.1 c1.1
2.2.1.1.1.1 a2.2.1.1.1.1 b2.2.1.1.1.1 c2.2.1.1.1.1
2.2.2.2.1.4 a2.2.2.2.1.4 b2.2.2.2.1.4 c2.2.2.2.1.4
#N/A a b c
34 changes: 27 additions & 7 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,39 @@
import pandas as pd
import numpy as np

from bio_hansel.metadata import read_metadata_table, merge_metadata_with_summary_results
from bio_hansel.metadata import read_metadata_table, merge_results_with_metadata
from bio_hansel.utils import df_field_fillna


def test_read_metadata():
df_results = pd.read_table('tests/data/subtyping-results.tsv')
df_results = df_field_fillna(df_results)
df_md = read_metadata_table('tests/data/subtype-metadata.tsv')
df_merged = merge_metadata_with_summary_results(df_results, df_md)
df_merged = merge_results_with_metadata(df_results, df_md)
assert np.all(df_md.columns.isin(df_merged.columns))
for idx, row in df_merged.iterrows():
# for metadata columns a,b,c, the merged metadata value should be the column name + subtype
# e.g. for subtype 1.1, a=a1.1; b=b1.1; c=c1.1
# if there was no subtype result, then the metadata field value should be NaN/None
for column in ['a', 'b', 'c']:
if row['subtype'] != '#N/A':
assert row[column] == column + row['subtype']
else:
assert np.isnan(row[column])


def test_read_metadata_with_na_subtype():
df_results = pd.read_table('tests/data/subtyping-results.tsv')
df_results = df_field_fillna(df_results)
df_md = read_metadata_table('tests/data/subtype-metadata-with-na.tsv')
df_merged = merge_results_with_metadata(df_results, df_md)
assert np.all(df_md.columns.isin(df_merged.columns))
for i, r in df_merged.iterrows():
for idx, row in df_merged.iterrows():
# for metadata columns a,b,c, the merged metadata value should be the column name + subtype
# e.g. for subtype 1.1, a=a1.1; b=b1.1; c=c1.1
# if there was no subtype result, then the metadata field value should be NaN/None
for c in list('abc'):
if r['subtype'] != '':
assert r[c] == c + r['subtype']
for column in ['a', 'b', 'c']:
if row['subtype'] != '#N/A':
assert row[column] == column + row['subtype']
else:
assert np.isnan(r[c])
assert row[column] == column, f'row={row}'

0 comments on commit 22c9a04

Please sign in to comment.