diff --git a/LICENSE b/LICENSE index a822c4d8..481a6bc9 100644 --- a/LICENSE +++ b/LICENSE @@ -4,6 +4,9 @@ dammit -- a simple de novo transcriptome annotator Copyright (C) 2015-2018 Camille Scott +Copyright (C) 2019 Richard Meitern + + All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/dammit/annotate.py b/dammit/annotate.py index cc68e684..fd7b519d 100644 --- a/dammit/annotate.py +++ b/dammit/annotate.py @@ -99,17 +99,17 @@ def run_annotation(handler): Args: handler (handler.TaskHandler): Handler with tasks for the pipeline. ''' - print(ui.header('Annotation', level=3)) - print(ui.header('Info', level=4)) + print(ui.header('Annotation', level=3),flush=True) + print(ui.header('Info', level=4),flush=True) info = {'Doit Database': handler.dep_file, 'Input Transcriptome': handler.files['transcriptome']} - print(ui.listing(info)) + print(ui.listing(info),flush=True) msg = '*All annotation tasks up-to-date.*' uptodate, statuses = handler.print_statuses(uptodate_msg=msg) if not uptodate: return handler.run() else: - print('**Pipeline is already completed!**') + print('**Pipeline is already completed!**',flush=True) sys.exit(0) diff --git a/dammit/app.py b/dammit/app.py index e8fc3f39..62a98538 100644 --- a/dammit/app.py +++ b/dammit/app.py @@ -37,11 +37,11 @@ def __init__(self, arg_src=sys.argv[1:]): self.config_d.update(vars(self.args)) def run(self): - print(ui.header('dammit')) + print(ui.header('dammit'),flush=True) print(ui.header(__description__, level=2)) about = '\nby {0}\n\n**v{1}**, {2}\n'.format(', '.join(__authors__), __version__, __date__) - print(about) + print(about,flush=True) return self.args.func() def description(self): @@ -91,6 +91,15 @@ def add_common_args(parser): ' Full list of options is below.' ) + parser.add_argument('--orthodb-group', + default='metazoa', + metavar='[metazoa, arthropoda, vertebrata, ...]', + choices=list(self.databases_d['OrthoDB'].keys()), + help='Which orthoDB group to use. Should be chosen'\ + ' based on the organism being annotated.'\ + ' Full list of options is below.' + ) + parser.add_argument('--n_threads', type=int, default=1, @@ -273,7 +282,7 @@ def handle_migrate(self): def handle_databases(self): log.start_logging() - print(ui.header('submodule: databases', level=2)) + print(ui.header('submodule: databases', level=2),flush=True) handler = databases.get_handler(self.config_d) if self.args.quick: @@ -293,7 +302,7 @@ def handle_databases(self): def handle_annotate(self): log.start_logging() - print(ui.header('submodule: annotate', level=2)) + print(ui.header('submodule: annotate', level=2),flush=True) db_handler = databases.get_handler(self.config_d) diff --git a/dammit/config.json b/dammit/config.json index aae66d54..cb182730 100644 --- a/dammit/config.json +++ b/dammit/config.json @@ -21,6 +21,11 @@ "output_suffix": ".busco.results", "params": [] }, + "orthodb": { + "db_dir": "orthoDBv10", + "output_suffix": ".ortho.results", + "params": [] + }, "hmmer": { "hmmscan": [], diff --git a/dammit/databases.json b/dammit/databases.json index 5b17a5fb..36418200 100644 --- a/dammit/databases.json +++ b/dammit/databases.json @@ -48,24 +48,86 @@ }, "OrthoDB": { - "access": "download", - "db_type": "prot", - "filename": "aa_seq_euk.fasta", - "md5": "f40da35b9135c5f326380089f0ddb7a2", - "fileformat": "gz", - "version": 8, - "url": "ftp://cegg.unige.ch/OrthoDB8/Eukaryotes/FASTA/aa_seq_euk.fasta.gz" + + "arthropoda":{ + "access": "download", + "db_type": "prot", + "folder": "odb10_arthropoda", + "md5": null, + "fileformat": "tar.gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10_arthropoda_fasta.tar.gz" + }, + "metazoa":{ + "access": "download", + "db_type": "prot", + "folder": "odb10_metazoa", + "md5": null, + "fileformat": "tar.gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10_metazoa_fasta.tar.gz" + }, + "vertebrata":{ + "access": "download", + "db_type": "prot", + "folder": "odb10_vertebrata", + "md5": null, + "fileformat": "tar.gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10_vertebrata_fasta.tar.gz" + }, + "protozoa":{ + "access": "download", + "db_type": "prot", + "folder": "odb10_protozoa", + "md5": null, + "fileformat": "tar.gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10_protozoa_fasta.tar.gz" + }, + + "genes": { + "access": "download", + "db_type": "txt", + "filename": "odb10v0_genes.tab", + "md5": "34b023f5334b124ed846ae526fbac8fc", + "fileformat": "gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10v0_genes.tab.gz" + }, + + "og2genes": { + "access": "download", + "db_type": "txt", + "filename": "odb10v0_OG2genes.tab", + "md5": "d16c8a2272d8e35684279c4d6e0ed1a0", + "fileformat": "gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10v0_OG2genes.tab.gz" + }, + "og2xrefs": { + "access": "download", + "db_type": "txt", + "filename": "odb10v0_OG_xrefs.tab", + "md5": "6d1279b4e4819b5d95c2e0ec81a53417", + "fileformat": "gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10v0_OG_xrefs.tab.gz" + }, + + "ogs": { + "access": "download", + "db_type": "txt", + "filename": "odb10v0_OGs.tab", + "md5": "b9d6cf78cfa401f2ee8c99627b95a8af", + "fileformat": "gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10v0_OGs.tab.gz" + } + }, - "orthodb_genes": { - "access": "download", - "db_type": "txt", - "filename": "ODB8_EukOGs_genes_ALL_levels.txt", - "md5": null, - "fileformat": "gz", - "version": 8, - "url": "ftp://cegg.unige.ch/OrthoDB8/Eukaryotes/Genes_to_OGs/ODB8_EukOGs_genes_ALL_levels.txt.gz" - }, + "BUSCO": { "deltaepsilonsub": { diff --git a/dammit/databases.py b/dammit/databases.py index ce12f47c..88945b00 100644 --- a/dammit/databases.py +++ b/dammit/databases.py @@ -18,6 +18,7 @@ from dammit.tasks.infernal import CMPressTask from dammit.tasks.shell import (get_download_and_gunzip_task, get_download_and_untar_task, + get_unexisting_folder_cat_task, get_download_task, get_gunzip_task) @@ -76,25 +77,25 @@ def print_meta(handler): handler (handler.TaskHandler): The database task handler. ''' - print(ui.header('Info', level=4)) + print(ui.header('Info', level=4),flush=True) info = {'Doit Database': handler.dep_file, 'Database Directory': handler.directory} - print(ui.listing(info)) + print(ui.listing(info),flush=True) def install(handler): '''Run the database prep pipeline from the given handler. ''' - print(ui.header('Database Install', level=3)) + print(ui.header('Database Install', level=3),flush=True) print_meta(handler) msg = '*All database tasks up-to-date.*' uptodate, statuses = handler.print_statuses(uptodate_msg=msg) if not uptodate: - print('Installing...') + print('Installing...',flush=True) return handler.run() else: - print('Nothing to install!') + print('Nothing to install!',flush=True) return 0 @@ -103,7 +104,7 @@ def check_or_fail(handler): with status 2. ''' - print(ui.header('Database Check', level=3)) + print(ui.header('Database Check', level=3),flush=True) print_meta(handler) msg = '*All database tasks up-to-date.*' uptodate, statuses = handler.print_statuses(uptodate_msg=msg) @@ -113,7 +114,7 @@ def check_or_fail(handler): ' already installed them, make sure you\'ve given' ' the correct location to `--database-dir` or have' ' exported the $DAMMIT_DB_DIR environment' - ' variable.')) + ' variable.'),flush=True) sys.exit(2) @@ -134,7 +135,7 @@ def build_default_pipeline(handler, config, databases, with_uniref=False, with_n register_pfam_tasks(handler, config['hmmer']['hmmpress'], databases) register_rfam_tasks(handler, config['infernal']['cmpress'], databases) - register_orthodb_tasks(handler, config['last']['lastdb'], databases) + register_orthodb_tasks(handler, config, databases) register_busco_tasks(handler, config, databases) register_sprot_tasks(handler, config['last']['lastdb'], databases) if with_uniref: @@ -193,26 +194,34 @@ def register_rfam_tasks(handler, params, databases): return handler -def register_orthodb_tasks(handler, params, databases): +def register_orthodb_tasks(handler, config, databases): + #TODO get OrthoDBversion spesific + #Prepare variables + last_db_params = config['last']['lastdb'] orthodb = databases['OrthoDB'] - archive_fn = '{0}.{1}'.format(orthodb['filename'], - orthodb['fileformat']) - target_fn = path.join(handler.directory, orthodb['filename']) - - dl_task = get_download_task(orthodb['url'], - archive_fn, - md5=orthodb['md5']) - gz_task = get_gunzip_task(archive_fn, target_fn) - - handler.register_task('download:OrthoDB', dl_task, - files={'OrthoDB-gz': archive_fn}) - handler.register_task('gunzip:OrthoDB', gz_task, - files={'OrthoDB': target_fn}) + orthodb_dir = path.join(handler.directory, config['orthodb']['db_dir']) + group_name = config['orthodb_group'] + group = orthodb[group_name] + files = {'orthoDB-{0}'.format(group_name): path.join(orthodb_dir, group['folder'])} + + target_dir = orthodb_dir + "/"+ group_name + "/"+ group_name + target_fn = orthodb_dir + "/"+ group_name+ "/"+ group_name + + dl_task = get_download_and_untar_task(group['url'], orthodb_dir, label=group_name) + + #TODO take the folder name from json not from the untar task as the folder that isa untar'ed might have a diffrent name (problem with vertebrata/e eg) + cat_task = get_unexisting_folder_cat_task(orthodb_dir + "/"+ group_name + '/Rawdata',target_fn) + + + #Register tasks + handler.register_task('download_and_untar:OrthoDB', dl_task, files=files) + handler.register_task('join_files:OrthDB',cat_task, files={'OrthoDB': target_dir}) + #TODO remove target_dir/Rawdata handler.register_task('lastdb:OrthoDB', LastDBTask().task(target_fn, target_fn, prot=True, - params=params)) + params=last_db_params)) return handler diff --git a/dammit/handler.py b/dammit/handler.py index 230599d8..3406e299 100644 --- a/dammit/handler.py +++ b/dammit/handler.py @@ -159,17 +159,17 @@ def print_statuses(self, uptodate_msg='All tasks up-to-date!', uptodate, statuses = self.check_uptodate() if uptodate: - print(ui.paragraph(uptodate_msg)) + print(ui.paragraph(uptodate_msg),flush=True) else: print(ui.paragraph(outofdate_msg)) uptodate_list = [t for t,s in statuses.items() if s is True] outofdate_list = [t for t,s in statuses.items() if s is False] if uptodate_list: - print('\nUp-to-date tasks:') - print(ui.listing(uptodate_list)) + print('\nUp-to-date tasks:',flush=True) + print(ui.listing(uptodate_list),flush=True) if outofdate_list: - print('\nOut-of-date tasks:') - print(ui.listing(outofdate_list)) + print('\nOut-of-date tasks:',flush=True) + print(ui.listing(outofdate_list),flush=True) return uptodate, statuses def check_uptodate(self): @@ -207,7 +207,7 @@ def run(self, doit_args=None, verbose=True): int: Exit status of the doit command. ''' if verbose: - print(ui.header('Run Tasks', level=4)) + print(ui.header('Run Tasks', level=4),flush=True) if doit_args is None: doit_args = ['run'] if self.n_threads > 1: diff --git a/dammit/log.py b/dammit/log.py index 098b918a..65b2b81a 100644 --- a/dammit/log.py +++ b/dammit/log.py @@ -35,7 +35,7 @@ def run(filename=None, test=False): filename = log_file if test is True: filename = os.path.join(log_dir, 'dammit-tests.log') - print('Logger in testing mode:', filename) + print('Logger in testing mode:', filename,flush=True) logging.basicConfig(level=logging.DEBUG, **config) run_handler = logging.FileHandler(filename) diff --git a/dammit/profile.py b/dammit/profile.py index 0d94f107..a93ebe65 100644 --- a/dammit/profile.py +++ b/dammit/profile.py @@ -44,7 +44,7 @@ def start_profiler(self, filename=None, blockname='__main__'): self.blockname = blockname self.running = True self.lock = filelock.FileLock('{0}.lock'.format(self.filename)) - print('Profiling is ON:', self.filename, '\n', file=sys.stderr) + print('Profiling is ON:', self.filename, '\n', file=sys.stderr,flush=True) def write_result(self, task_name, start_time, end_time, elapsed_time): '''Write results to the file, using the given task name as the diff --git a/dammit/tasks/shell.py b/dammit/tasks/shell.py index 613dd39b..d52e9b4e 100644 --- a/dammit/tasks/shell.py +++ b/dammit/tasks/shell.py @@ -208,6 +208,28 @@ def get_cat_task(file_list, target_fn): 'targets': [target_fn], 'clean': [clean_targets]} +@doit_task +def get_unexisting_folder_cat_task(file_dir, target_fn): + '''Create a doit task to `cat` together all .fs files in folder + result to the given target. + + Args: + file_dir (str): The directory containing files to `cat`. + target_fn (str): The target file. + + Returns: + dict: A doit task. + ''' + file_list = file_dir+"/*.fs" + + cmd = 'cat {files} > {t}'.format(files=file_list, t=target_fn) + + return {'name': 'cat:' + os.path.basename(target_fn), + 'actions': [cmd], + 'targets': [target_fn], + 'uptodate': [run_once], + 'clean': [clean_targets]} + @doit_task def get_copy_file_task(src, dst): diff --git a/dammit/utils.py b/dammit/utils.py index d0781283..21f92d2c 100644 --- a/dammit/utils.py +++ b/dammit/utils.py @@ -122,7 +122,7 @@ def __enter__(self): if self.verbose: print('Move to `{0}` from cwd: `{1}`'.format(self.target, self.cwd, - file=sys.stderr)) + file=sys.stderr),flush=True) if self.create: try: os.mkdir(self.target)