Skip to content

OrthoDBv10 #146

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 9 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ dammit -- a simple de novo transcriptome annotator
Copyright (C) 2015-2018 Camille Scott
<[email protected]>

Copyright (C) 2019 Richard Meitern
<[email protected]>

All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Expand Down
8 changes: 4 additions & 4 deletions dammit/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,17 +99,17 @@ def run_annotation(handler):
Args:
handler (handler.TaskHandler): Handler with tasks for the pipeline.
'''
print(ui.header('Annotation', level=3))
print(ui.header('Info', level=4))
print(ui.header('Annotation', level=3),flush=True)
print(ui.header('Info', level=4),flush=True)
info = {'Doit Database': handler.dep_file,
'Input Transcriptome': handler.files['transcriptome']}
print(ui.listing(info))
print(ui.listing(info),flush=True)
msg = '*All annotation tasks up-to-date.*'
uptodate, statuses = handler.print_statuses(uptodate_msg=msg)
if not uptodate:
return handler.run()
else:
print('**Pipeline is already completed!**')
print('**Pipeline is already completed!**',flush=True)
sys.exit(0)


Expand Down
17 changes: 13 additions & 4 deletions dammit/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ def __init__(self, arg_src=sys.argv[1:]):
self.config_d.update(vars(self.args))

def run(self):
print(ui.header('dammit'))
print(ui.header('dammit'),flush=True)
print(ui.header(__description__, level=2))
about = '\nby {0}\n\n**v{1}**, {2}\n'.format(', '.join(__authors__),
__version__, __date__)
print(about)
print(about,flush=True)
return self.args.func()

def description(self):
Expand Down Expand Up @@ -91,6 +91,15 @@ def add_common_args(parser):
' Full list of options is below.'
)

parser.add_argument('--orthodb-group',
default='metazoa',
metavar='[metazoa, arthropoda, vertebrata, ...]',
choices=list(self.databases_d['OrthoDB'].keys()),
help='Which orthoDB group to use. Should be chosen'\
' based on the organism being annotated.'\
' Full list of options is below.'
)

parser.add_argument('--n_threads',
type=int,
default=1,
Expand Down Expand Up @@ -273,7 +282,7 @@ def handle_migrate(self):

def handle_databases(self):
log.start_logging()
print(ui.header('submodule: databases', level=2))
print(ui.header('submodule: databases', level=2),flush=True)

handler = databases.get_handler(self.config_d)
if self.args.quick:
Expand All @@ -293,7 +302,7 @@ def handle_databases(self):

def handle_annotate(self):
log.start_logging()
print(ui.header('submodule: annotate', level=2))
print(ui.header('submodule: annotate', level=2),flush=True)

db_handler = databases.get_handler(self.config_d)

Expand Down
5 changes: 5 additions & 0 deletions dammit/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
"output_suffix": ".busco.results",
"params": []
},
"orthodb": {
"db_dir": "orthoDBv10",
"output_suffix": ".ortho.results",
"params": []
},

"hmmer": {
"hmmscan": [],
Expand Down
94 changes: 78 additions & 16 deletions dammit/databases.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,24 +48,86 @@
},

"OrthoDB": {
"access": "download",
"db_type": "prot",
"filename": "aa_seq_euk.fasta",
"md5": "f40da35b9135c5f326380089f0ddb7a2",
"fileformat": "gz",
"version": 8,
"url": "ftp://cegg.unige.ch/OrthoDB8/Eukaryotes/FASTA/aa_seq_euk.fasta.gz"

"arthropoda":{
"access": "download",
"db_type": "prot",
"folder": "odb10_arthropoda",
"md5": null,
"fileformat": "tar.gz",
"version": 10,
"url": "https://v100.orthodb.org/download/odb10_arthropoda_fasta.tar.gz"
},
"metazoa":{
"access": "download",
"db_type": "prot",
"folder": "odb10_metazoa",
"md5": null,
"fileformat": "tar.gz",
"version": 10,
"url": "https://v100.orthodb.org/download/odb10_metazoa_fasta.tar.gz"
},
"vertebrata":{
"access": "download",
"db_type": "prot",
"folder": "odb10_vertebrata",
"md5": null,
"fileformat": "tar.gz",
"version": 10,
"url": "https://v100.orthodb.org/download/odb10_vertebrata_fasta.tar.gz"
},
"protozoa":{
"access": "download",
"db_type": "prot",
"folder": "odb10_protozoa",
"md5": null,
"fileformat": "tar.gz",
"version": 10,
"url": "https://v100.orthodb.org/download/odb10_protozoa_fasta.tar.gz"
},

"genes": {
"access": "download",
"db_type": "txt",
"filename": "odb10v0_genes.tab",
"md5": "34b023f5334b124ed846ae526fbac8fc",
"fileformat": "gz",
"version": 10,
"url": "https://v100.orthodb.org/download/odb10v0_genes.tab.gz"
},

"og2genes": {
"access": "download",
"db_type": "txt",
"filename": "odb10v0_OG2genes.tab",
"md5": "d16c8a2272d8e35684279c4d6e0ed1a0",
"fileformat": "gz",
"version": 10,
"url": "https://v100.orthodb.org/download/odb10v0_OG2genes.tab.gz"
},
"og2xrefs": {
"access": "download",
"db_type": "txt",
"filename": "odb10v0_OG_xrefs.tab",
"md5": "6d1279b4e4819b5d95c2e0ec81a53417",
"fileformat": "gz",
"version": 10,
"url": "https://v100.orthodb.org/download/odb10v0_OG_xrefs.tab.gz"
},

"ogs": {
"access": "download",
"db_type": "txt",
"filename": "odb10v0_OGs.tab",
"md5": "b9d6cf78cfa401f2ee8c99627b95a8af",
"fileformat": "gz",
"version": 10,
"url": "https://v100.orthodb.org/download/odb10v0_OGs.tab.gz"
}

},

"orthodb_genes": {
"access": "download",
"db_type": "txt",
"filename": "ODB8_EukOGs_genes_ALL_levels.txt",
"md5": null,
"fileformat": "gz",
"version": 8,
"url": "ftp://cegg.unige.ch/OrthoDB8/Eukaryotes/Genes_to_OGs/ODB8_EukOGs_genes_ALL_levels.txt.gz"
},


"BUSCO": {
"deltaepsilonsub": {
Expand Down
55 changes: 32 additions & 23 deletions dammit/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from dammit.tasks.infernal import CMPressTask
from dammit.tasks.shell import (get_download_and_gunzip_task,
get_download_and_untar_task,
get_unexisting_folder_cat_task,
get_download_task,
get_gunzip_task)

Expand Down Expand Up @@ -76,25 +77,25 @@ def print_meta(handler):
handler (handler.TaskHandler): The database task handler.
'''

print(ui.header('Info', level=4))
print(ui.header('Info', level=4),flush=True)
info = {'Doit Database': handler.dep_file,
'Database Directory': handler.directory}
print(ui.listing(info))
print(ui.listing(info),flush=True)


def install(handler):
'''Run the database prep pipeline from the given handler.
'''

print(ui.header('Database Install', level=3))
print(ui.header('Database Install', level=3),flush=True)
print_meta(handler)
msg = '*All database tasks up-to-date.*'
uptodate, statuses = handler.print_statuses(uptodate_msg=msg)
if not uptodate:
print('Installing...')
print('Installing...',flush=True)
return handler.run()
else:
print('Nothing to install!')
print('Nothing to install!',flush=True)
return 0


Expand All @@ -103,7 +104,7 @@ def check_or_fail(handler):
with status 2.
'''

print(ui.header('Database Check', level=3))
print(ui.header('Database Check', level=3),flush=True)
print_meta(handler)
msg = '*All database tasks up-to-date.*'
uptodate, statuses = handler.print_statuses(uptodate_msg=msg)
Expand All @@ -113,7 +114,7 @@ def check_or_fail(handler):
' already installed them, make sure you\'ve given'
' the correct location to `--database-dir` or have'
' exported the $DAMMIT_DB_DIR environment'
' variable.'))
' variable.'),flush=True)
sys.exit(2)


Expand All @@ -134,7 +135,7 @@ def build_default_pipeline(handler, config, databases, with_uniref=False, with_n

register_pfam_tasks(handler, config['hmmer']['hmmpress'], databases)
register_rfam_tasks(handler, config['infernal']['cmpress'], databases)
register_orthodb_tasks(handler, config['last']['lastdb'], databases)
register_orthodb_tasks(handler, config, databases)
register_busco_tasks(handler, config, databases)
register_sprot_tasks(handler, config['last']['lastdb'], databases)
if with_uniref:
Expand Down Expand Up @@ -193,26 +194,34 @@ def register_rfam_tasks(handler, params, databases):
return handler


def register_orthodb_tasks(handler, params, databases):
def register_orthodb_tasks(handler, config, databases):
#TODO make this OrthoDB-version specific
#Prepare variables
last_db_params = config['last']['lastdb']
orthodb = databases['OrthoDB']
archive_fn = '{0}.{1}'.format(orthodb['filename'],
orthodb['fileformat'])
target_fn = path.join(handler.directory, orthodb['filename'])

dl_task = get_download_task(orthodb['url'],
archive_fn,
md5=orthodb['md5'])
gz_task = get_gunzip_task(archive_fn, target_fn)

handler.register_task('download:OrthoDB', dl_task,
files={'OrthoDB-gz': archive_fn})
handler.register_task('gunzip:OrthoDB', gz_task,
files={'OrthoDB': target_fn})
orthodb_dir = path.join(handler.directory, config['orthodb']['db_dir'])
group_name = config['orthodb_group']
group = orthodb[group_name]
files = {'orthoDB-{0}'.format(group_name): path.join(orthodb_dir, group['folder'])}

target_dir = orthodb_dir + "/"+ group_name + "/"+ group_name
target_fn = orthodb_dir + "/"+ group_name+ "/"+ group_name

dl_task = get_download_and_untar_task(group['url'], orthodb_dir, label=group_name)

#TODO take the folder name from json, not from the untar task, as the folder that is untar'ed might have a different name (problem with vertebrata/e, e.g.)
cat_task = get_unexisting_folder_cat_task(orthodb_dir + "/"+ group_name + '/Rawdata',target_fn)


#Register tasks
handler.register_task('download_and_untar:OrthoDB', dl_task, files=files)
handler.register_task('join_files:OrthDB',cat_task, files={'OrthoDB': target_dir})
#TODO remove target_dir/Rawdata
handler.register_task('lastdb:OrthoDB',
LastDBTask().task(target_fn,
target_fn,
prot=True,
params=params))
params=last_db_params))
return handler


Expand Down
12 changes: 6 additions & 6 deletions dammit/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,17 +159,17 @@ def print_statuses(self, uptodate_msg='All tasks up-to-date!',

uptodate, statuses = self.check_uptodate()
if uptodate:
print(ui.paragraph(uptodate_msg))
print(ui.paragraph(uptodate_msg),flush=True)
else:
print(ui.paragraph(outofdate_msg))
uptodate_list = [t for t,s in statuses.items() if s is True]
outofdate_list = [t for t,s in statuses.items() if s is False]
if uptodate_list:
print('\nUp-to-date tasks:')
print(ui.listing(uptodate_list))
print('\nUp-to-date tasks:',flush=True)
print(ui.listing(uptodate_list),flush=True)
if outofdate_list:
print('\nOut-of-date tasks:')
print(ui.listing(outofdate_list))
print('\nOut-of-date tasks:',flush=True)
print(ui.listing(outofdate_list),flush=True)
return uptodate, statuses

def check_uptodate(self):
Expand Down Expand Up @@ -207,7 +207,7 @@ def run(self, doit_args=None, verbose=True):
int: Exit status of the doit command.
'''
if verbose:
print(ui.header('Run Tasks', level=4))
print(ui.header('Run Tasks', level=4),flush=True)
if doit_args is None:
doit_args = ['run']
if self.n_threads > 1:
Expand Down
2 changes: 1 addition & 1 deletion dammit/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def run(filename=None, test=False):
filename = log_file
if test is True:
filename = os.path.join(log_dir, 'dammit-tests.log')
print('Logger in testing mode:', filename)
print('Logger in testing mode:', filename,flush=True)
logging.basicConfig(level=logging.DEBUG, **config)

run_handler = logging.FileHandler(filename)
Expand Down
2 changes: 1 addition & 1 deletion dammit/profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def start_profiler(self, filename=None, blockname='__main__'):
self.blockname = blockname
self.running = True
self.lock = filelock.FileLock('{0}.lock'.format(self.filename))
print('Profiling is ON:', self.filename, '\n', file=sys.stderr)
print('Profiling is ON:', self.filename, '\n', file=sys.stderr,flush=True)

def write_result(self, task_name, start_time, end_time, elapsed_time):
'''Write results to the file, using the given task name as the
Expand Down
22 changes: 22 additions & 0 deletions dammit/tasks/shell.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,28 @@ def get_cat_task(file_list, target_fn):
'targets': [target_fn],
'clean': [clean_targets]}

@doit_task
def get_unexisting_folder_cat_task(file_dir, target_fn):
    '''Build a doit task that `cat`s every `*.fs` file in a directory
    into a single target file.

    The glob pattern is only embedded in the command string; this
    function itself never touches the filesystem, so `file_dir` does not
    have to exist at the time the task is created.

    Args:
        file_dir (str): Directory holding the `.fs` files to `cat`.
        target_fn (str): Path of the concatenated output file.

    Returns:
        dict: A doit task.
    '''
    # Glob for all `.fs` files directly inside `file_dir`.
    pattern = '/'.join((file_dir, '*.fs'))
    shell_cmd = 'cat {0} > {1}'.format(pattern, target_fn)
    task_name = 'cat:' + os.path.basename(target_fn)

    task = dict(name=task_name,
                actions=[shell_cmd],
                targets=[target_fn],
                uptodate=[run_once],
                clean=[clean_targets])
    return task


@doit_task
def get_copy_file_task(src, dst):
Expand Down
2 changes: 1 addition & 1 deletion dammit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def __enter__(self):
if self.verbose:
print('Move to `{0}` from cwd: `{1}`'.format(self.target,
self.cwd,
file=sys.stderr))
file=sys.stderr),flush=True)
if self.create:
try:
os.mkdir(self.target)
Expand Down