Skip to content

Commit

Permalink
v1.9.47: refactoring + a command to name the tree
Browse files Browse the repository at this point in the history
  • Loading branch information
annazhukova committed Sep 11, 2024
1 parent 2b86711 commit 5080c5e
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 55 deletions.
15 changes: 15 additions & 0 deletions pastml/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import logging
from datetime import datetime

PASTML_VERSION = '1.9.47'

METHOD = 'method'
STATES = 'states'
CHARACTER = 'character'
Expand Down Expand Up @@ -89,3 +92,15 @@ def value2list(n, value, default_value):
else:
value += [default_value] * (n - len(value))
return value


def _set_up_pastml_logger(verbose):
    """
    Configures the shared 'pastml' logger and returns it.

    :param verbose: if truthy, the logger level is set to DEBUG, otherwise to ERROR
    :return: the configured 'pastml' logger
    """
    pastml_logger = logging.getLogger('pastml')
    level = logging.DEBUG if verbose else logging.ERROR
    pastml_logger.setLevel(level=level)
    # Records are not handed over to the ancestor loggers.
    pastml_logger.propagate = False
    if pastml_logger.hasHandlers():
        # A handler is already attached: do not add a second one.
        return pastml_logger
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(
        logging.Formatter('%(name)s:%(levelname)s:%(asctime)s %(message)s', datefmt="%H:%M:%S"))
    pastml_logger.addHandler(console_handler)
    return pastml_logger
43 changes: 5 additions & 38 deletions pastml/acr.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import os
import sys
import warnings
from collections import defaultdict, Counter
from multiprocessing.pool import ThreadPool
Expand All @@ -11,7 +12,7 @@
from ete3 import Tree

from pastml import col_name2cat, value2list, STATES, METHOD, CHARACTER, get_personalized_feature_name, numeric2datetime, \
datetime2numeric
PASTML_VERSION, _set_up_pastml_logger
from pastml.annotation import preannotate_forest, ForestStats
from pastml.file import get_combined_ancestral_state_file, get_named_tree_file, get_pastml_parameter_file, \
get_pastml_marginal_prob_file, get_pastml_work_dir
Expand All @@ -27,15 +28,13 @@
from pastml.parsimony import is_parsimonious, parsimonious_acr, ACCTRAN, DELTRAN, DOWNPASS, MP_METHODS, MP, \
get_default_mp_method
from pastml.tree import name_tree, annotate_dates, DATE, read_forest, DATE_CI, resolve_trees, IS_POLYTOMY, \
unresolve_trees, clear_extra_features
unresolve_trees, clear_extra_features, parse_date
from pastml.visualisation import get_formatted_date
from pastml.visualisation.cytoscape_manager import visualize, TIMELINE_SAMPLED, TIMELINE_NODES, TIMELINE_LTT, \
DIST_TO_ROOT_LABEL, DATE_LABEL
from pastml.visualisation.itol_manager import generate_itol_annotations
from pastml.visualisation.tree_compressor import REASONABLE_NUMBER_OF_TIPS, VERTICAL, HORIZONTAL, TRIM

PASTML_VERSION = '1.9.46'

model2class = {F81: F81Model, JC: JCModel, CUSTOM_RATES: CustomRatesModel, HKY: HKYModel, JTT: JTTModel, EFT: EFTModel}

warnings.filterwarnings("ignore", append=True)
Expand Down Expand Up @@ -675,17 +674,6 @@ def parse_col_val(cv):
pool.close()


def parse_date(d):
    """
    Converts a root date given either as a number or as a date string to a numeric value.

    :param d: the date: either something float() accepts (e.g. 2020.5 or '2020.5'),
        or a date string that pandas can parse
    :return: float(d) if d is numeric, otherwise the result of datetime2numeric
        applied to the timestamp parsed by pandas
    :raises ValueError: if d is neither numeric nor parsable as a date
    """
    try:
        return float(d)
    except ValueError:
        # Not a plain number: try to read it as a calendar date below.
        pass
    try:
        # infer_datetime_format (a parsing-speed option) is deprecated since pandas 2.0,
        # where format inference is the default, so it is no longer passed.
        timestamp = pd.to_datetime(d)
        return datetime2numeric(timestamp)
    except ValueError as e:
        # Keep the original cause attached to the user-facing error.
        raise ValueError('Could not infer the date format for root date "{}", please check it.'
                         .format(d)) from e


def _validate_input(tree_nwk, columns=None, name_column=None, data=None, data_sep='\t', id_index=0,
root_dates=None, copy_only=False, parameters=None, rates=None):
logger = logging.getLogger('pastml')
Expand All @@ -700,16 +688,6 @@ def _validate_input(tree_nwk, columns=None, name_column=None, data=None, data_se
columns = [columns]

roots = read_forest(tree_nwk, columns=columns if data is None else None)
num_neg = 0
for root in roots:
for _ in root.traverse():
if _.dist < 0:
num_neg += 1
_.dist = 0
if num_neg:
logger.warning('Input tree{} contained {} negative branches: we put them to zero.'
.format('s' if len(roots) > 0 else '', num_neg))
logger.debug('Read the tree{} {}.'.format('s' if len(roots) > 0 else '', tree_nwk))

column2annotated = Counter()
column2states = defaultdict(set)
Expand Down Expand Up @@ -882,18 +860,6 @@ def _serialize_predicted_states(columns, out_data, roots, dates_are_dates=True):
return pd.DataFrame(index=ids, data=data, columns=['dist', DATE] + columns)


def _set_up_pastml_logger(verbose):
    """
    Configures the shared 'pastml' logger and returns it.

    :param verbose: if truthy, the logger level is set to DEBUG, otherwise to ERROR
    :return: the configured 'pastml' logger
    """
    pastml_logger = logging.getLogger('pastml')
    level = logging.DEBUG if verbose else logging.ERROR
    pastml_logger.setLevel(level=level)
    # Records are not handed over to the ancestor loggers.
    pastml_logger.propagate = False
    if pastml_logger.hasHandlers():
        # A handler is already attached: do not add a second one.
        return pastml_logger
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(
        logging.Formatter('%(name)s:%(levelname)s:%(asctime)s %(message)s', datefmt="%H:%M:%S"))
    pastml_logger.addHandler(console_handler)
    return pastml_logger


def main():
"""
Entry point, calling :py:func:`pastml.acr.pastml_pipeline` with command-line arguments.
Expand Down Expand Up @@ -1090,7 +1056,8 @@ def main():
out_group.add_argument('-v', '--verbose', action='store_true',
help="print information on the progress of the analysis (to console)")

parser.add_argument('--version', action='version', version='%(prog)s {version}'.format(version=PASTML_VERSION))
parser.add_argument('--version', action='version',
version='%(prog)s {version}'.format(version=PASTML_VERSION))

parser.add_argument('--threads', required=False, default=0, type=int,
help="Number of threads PastML can use for parallesation. "
Expand Down
6 changes: 3 additions & 3 deletions pastml/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,13 +881,13 @@ def optimise_likelihood(forest, character, model, observed_frequencies):
if not model.get_num_params():
logger.debug('All the parameters are fixed for {}:\n{}{}.'
.format(character,
model._print_parameters,
model._print_parameters(),
'\tlog likelihood:\t{:.6f}'.format(likelihood))
)
else:
logger.debug('Initial values for {} parameter optimisation:\n{}{}.'
.format(character,
model._print_parameters,
model._print_parameters(),
'\tlog likelihood:\t{:.6f}'.format(likelihood))
)
if not model.basic_params_fixed():
Expand Down Expand Up @@ -918,7 +918,7 @@ def optimise_likelihood(forest, character, model, observed_frequencies):
.format(character))
logger.debug('Optimised parameters for {}:\n{}{}'
.format(character,
model._print_parameters,
model._print_parameters(),
'\tlog likelihood:\t{:.6f}'.format(likelihood)))
return likelihood

Expand Down
109 changes: 96 additions & 13 deletions pastml/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from Bio import Phylo
from ete3 import Tree, TreeNode
from ete3.parser.newick import NewickError
import pandas as pd
from pastml import datetime2numeric, PASTML_VERSION, _set_up_pastml_logger

POSTORDER = 'postorder'

Expand All @@ -19,13 +21,24 @@

# All patterns are raw strings, so that regex escapes such as \s are not treated
# as (invalid) string escape sequences by the Python parser.

# A number: optional sign(s), digits, optional digits/dots, optional exponent such as e+05.
DATE_REGEX = r'[+-]*[\d]+[.\d]*(?:[e][+-][\d]+){0,1}'
# date=<number> annotation (optionally quoted), preceded by '&', ',' or ':'; captures the number.
DATE_COMMENT_REGEX = r'[&,:]date[=]["]{{0,1}}({})["]{{0,1}}'.format(DATE_REGEX)
# CI_date=<number><sep><number> annotation, optionally quoted and/or wrapped in curly braces,
# where <sep> is an optional ',' or ';' surrounded by optional whitespace; captures both numbers.
CI_DATE_REGEX_LSD = r'[&,:]CI_date[=]["]{{0,1}}[{{]{{0,1}}({})\s*[,;]{{0,1}}\s*({})[}}]{{0,1}}["]{{0,1}}'.format(
    DATE_REGEX, DATE_REGEX)
# date_CI=<number>|<number> annotation (optionally quoted); captures both numbers.
CI_DATE_REGEX_PASTML = r'[&,:]date_CI[=]["]{{0,1}}({})[|]({})["]{{0,1}}'.format(DATE_REGEX, DATE_REGEX)
# Template to be completed with .format(column=...): captures the column value,
# i.e. everything up to the next ']', '^' or ','.
COLUMN_REGEX_PASTML = r'[&,]{column}[=]([^]^,]+)'

IS_POLYTOMY = 'polytomy'


def parse_date(d):
    """
    Converts a root date given either as a number or as a date string to a numeric value.

    :param d: the date: either something float() accepts (e.g. 2020.5 or '2020.5'),
        or a date string that pandas can parse
    :return: float(d) if d is numeric, otherwise the result of datetime2numeric
        applied to the timestamp parsed by pandas
    :raises ValueError: if d is neither numeric nor parsable as a date
    """
    try:
        return float(d)
    except ValueError:
        # Not a plain number: try to read it as a calendar date below.
        pass
    try:
        # infer_datetime_format (a parsing-speed option) is deprecated since pandas 2.0,
        # where format inference is the default, so it is no longer passed.
        timestamp = pd.to_datetime(d)
        return datetime2numeric(timestamp)
    except ValueError as e:
        # Keep the original cause attached to the user-facing error.
        raise ValueError('Could not infer the date format for root date "{}", please check it.'.format(d)) from e


def get_dist_to_root(tip):
dist_to_root = 0
n = tip
Expand All @@ -35,16 +48,19 @@ def get_dist_to_root(tip):
return dist_to_root


def annotate_dates(forest, root_dates=None):
def annotate_dates(forest, root_dates=None, annotate_zeros=True):
if root_dates is None:
root_dates = [0] * len(forest)
root_dates = [None] * len(forest)
for tree, root_date in zip(forest, root_dates):
for node in tree.traverse('preorder'):
if getattr(node, DATE, None) is None:
if node.is_root():
node.add_feature(DATE, root_date if root_date else 0)
if root_date is not None or annotate_zeros:
node.add_feature(DATE, root_date if root_date else 0)
else:
node.add_feature(DATE, getattr(node.up, DATE) + node.dist)
parent_date = getattr(node.up, DATE, None)
if parent_date is not None:
node.add_feature(DATE, parent_date + node.dist)
else:
node.add_feature(DATE, float(getattr(node, DATE)))
ci = getattr(node, DATE_CI, None)
Expand Down Expand Up @@ -158,17 +174,29 @@ def remove_certain_leaves(tr, to_remove=lambda node: False):


def read_forest(tree_path, columns=None):
    """
    Reads the tree(s) stored in a file and sets their negative branch lengths to zero.

    Nexus parsing (parse_nexus) is tried first; if it fails or yields nothing,
    the file content is split on ';' and each part is parsed as newick with read_tree.

    :param tree_path: path to the file containing the tree(s)
    :param columns: passed on to parse_nexus and read_tree
    :return: list of the roots of the trees that were read
    :raises ValueError: if no tree could be found in the file
    """
    logger = logging.getLogger('pastml')

    roots = None
    try:
        roots = parse_nexus(tree_path, columns=columns)
    except Exception:
        # Best effort: the file may simply not be in nexus format, newick is tried next.
        # (Exception rather than a bare except, so that KeyboardInterrupt/SystemExit get through.)
        pass
    if not roots:
        with open(tree_path, 'r') as f:
            # Whatever follows the last ';' is not a complete newick and is dropped.
            nwks = f.read().replace('\n', '').split(';')[:-1]
        # str.split never returns an empty list, so the emptiness check has to be done
        # on the ';'-terminated pieces rather than on the raw split result.
        if not nwks:
            raise ValueError('Could not find any trees (in newick or nexus format) in the file {}.'.format(tree_path))
        roots = [read_tree(nwk + ';', columns) for nwk in nwks]

    num_neg = 0
    for root in roots:
        for node in root.traverse():
            if node.dist < 0:
                num_neg += 1
                node.dist = 0

    # Plural form only when there are several trees.
    plural = 's' if len(roots) > 1 else ''
    if num_neg:
        logger.warning('Input tree{} contained {} negative branches: we put them to zero.'
                       .format(plural, num_neg))
    logger.debug('Read the tree{} {}.'.format(plural, tree_path))
    return roots


def read_tree(tree_path, columns=None):
Expand Down Expand Up @@ -455,7 +483,7 @@ def remove_node(n):
if num_removed_nodes:
logging.getLogger('pastml').debug(
'Removed {} polytomy resolution{} as inconsistent with model parameters.'
.format(num_removed_nodes, 's' if num_removed_nodes > 1 else ''))
.format(num_removed_nodes, 's' if num_removed_nodes > 1 else ''))
if num_new_nodes:
logging.getLogger('pastml').debug(
'Created {} new polytomy resolution{}.'.format(num_new_nodes, 's' if num_new_nodes > 1 else ''))
Expand Down Expand Up @@ -491,3 +519,58 @@ def copy_forest(forest, features=None):
for c in n.children:
todo.append((c, copied_n.add_child()))
return copied_forest


def main():
    """
    Entry point, calling :py:func:`pastml.tree.name_tree` with command-line arguments.

    Reads the input tree(s) with read_forest, annotates them with dates via annotate_dates
    (using the root date(s) if given), names them with name_tree
    and writes them in newick format, one tree per line, to the output path.

    :return: void
    :raises ValueError: if several root dates are given, but fewer than there are trees
    """
    import argparse

    parser = argparse.ArgumentParser(description="Name internal tree nodes as PastMl would.", prog='name_tree')

    parser.add_argument('-i', '--in_tree', help="input tree(s) in newick or nexus format (must be rooted).",
                        type=str, required=True)

    # The output path is opened unconditionally below, so it must be given:
    # argparse then reports a missing -o cleanly, instead of open(None) failing with a TypeError.
    parser.add_argument('-o', '--out_tree', required=True, type=str,
                        help="path where to save the named output tree(s) in newick format.")

    parser.add_argument('--root_date', required=False, default=None,
                        help="date(s) of the root(s) (for dated tree(s) only), "
                             "if specified, the corresponding dates will be added to the tree node annotations.",
                        type=str, nargs='*')
    parser.add_argument('-c', '--columns', nargs='*',
                        help="names of the annotation columns of the input tree (if any) "
                             "to be kept in the output tree. "
                             "If there are LSD2-like date-related columns ({} and {}) present in the tree, "
                             "they will be kept in any case, so no need to specify them among columns here."
                        .format(DATE, DATE_CI),
                        type=str)
    parser.add_argument('--version', action='version',
                        version='%(prog)s {version}'.format(version=PASTML_VERSION))

    params = parser.parse_args()

    roots = read_forest(params.in_tree, columns=params.columns)
    root_dates = None
    # nargs='*' turns a bare --root_date into an empty list: it is treated as "no root date",
    # otherwise annotate_dates would pair the trees with an empty list and skip all of them.
    if params.root_date:
        root_dates = [parse_date(d) for d in params.root_date]
        if 1 < len(root_dates) < len(roots):
            raise ValueError('{} trees are given, but only {} root dates.'.format(len(roots), len(root_dates)))
        elif 1 == len(root_dates):
            # A single root date is shared by all the trees.
            root_dates *= len(roots)
    annotate_dates(roots, root_dates=root_dates, annotate_zeros=False)
    for i, tree in enumerate(roots):
        # With several trees, the tree index is passed as a suffix to name_tree.
        name_tree(tree, suffix='' if len(roots) == 1 else '_{}'.format(i))
    with open(params.out_tree, 'w+') as f:
        f.write(
            '\n'.join(
                [root.write(format_root_node=True, format=3,
                            features=[DATE, DATE_CI] + (params.columns if params.columns else []))
                 for root in roots]))


if '__main__' == __name__:
main()
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3 :: Only',
],
version='1.9.46',
version='1.9.47',
description='Ancestral character reconstruction and visualisation for rooted phylogenetic trees.',
author='Anna Zhukova',
author_email='[email protected]',
Expand All @@ -36,6 +36,7 @@
'pastml = pastml.acr:main',
'geomap = pastml.visualisation.generate_geomap:main',
'transition_counter = pastml.utilities.transition_counter:main',
'name_tree = pastml.tree:main'
]
},
)

0 comments on commit 5080c5e

Please sign in to comment.