From 3ab13a95c6a24faf7c8d5d428b7173b94b7b1a06 Mon Sep 17 00:00:00 2001 From: Richard Meitern Date: Thu, 7 Feb 2019 17:23:10 +0200 Subject: [PATCH 1/7] test bruteforce OrthoDB update --- dammit/databases.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dammit/databases.json b/dammit/databases.json index 5b17a5fb..98b8a87a 100644 --- a/dammit/databases.json +++ b/dammit/databases.json @@ -50,21 +50,21 @@ "OrthoDB": { "access": "download", "db_type": "prot", - "filename": "aa_seq_euk.fasta", - "md5": "f40da35b9135c5f326380089f0ddb7a2", + "filename": "odb10_arthropoda.fasta", + "md5": "65414490cfe2073f78d7757e3a34607e", "fileformat": "gz", - "version": 8, - "url": "ftp://cegg.unige.ch/OrthoDB8/Eukaryotes/FASTA/aa_seq_euk.fasta.gz" + "version": 10, + "url": "https://v100.orthodb.org/download/odb10_arthropoda_fasta.tar.gz" }, "orthodb_genes": { "access": "download", "db_type": "txt", - "filename": "ODB8_EukOGs_genes_ALL_levels.txt", - "md5": null, + "filename": "odb10v0_genes.tab", + "md5": "34b023f5334b124ed846ae526fbac8fc", "fileformat": "gz", - "version": 8, - "url": "ftp://cegg.unige.ch/OrthoDB8/Eukaryotes/Genes_to_OGs/ODB8_EukOGs_genes_ALL_levels.txt.gz" + "version": 10, + "url": "https://v100.orthodb.org/download/odb10v0_genes.tab.gz" }, "BUSCO": { From cf43e988eea5545f8325d34381fe4ccf9b50ea33 Mon Sep 17 00:00:00 2001 From: Richard Meitern Date: Fri, 8 Feb 2019 12:50:18 +0200 Subject: [PATCH 2/7] added print flush, removed md5 --- dammit/annotate.py | 8 ++++---- dammit/app.py | 8 ++++---- dammit/databases.py | 14 +++++++------- dammit/handler.py | 12 ++++++------ dammit/log.py | 2 +- dammit/profile.py | 2 +- dammit/utils.py | 2 +- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/dammit/annotate.py b/dammit/annotate.py index adf0d404..cec66d01 100644 --- a/dammit/annotate.py +++ b/dammit/annotate.py @@ -99,17 +99,17 @@ def run_annotation(handler): Args: handler (handler.TaskHandler): Handler with tasks for the 
pipeline. ''' - print(ui.header('Annotation', level=3)) - print(ui.header('Info', level=4)) + print(ui.header('Annotation', level=3),flush=True) + print(ui.header('Info', level=4),flush=True) info = {'Doit Database': handler.dep_file, 'Input Transcriptome': handler.files['transcriptome']} - print(ui.listing(info)) + print(ui.listing(info),flush=True) msg = '*All annotation tasks up-to-date.*' uptodate, statuses = handler.print_statuses(uptodate_msg=msg) if not uptodate: return handler.run() else: - print('**Pipeline is already completed!**') + print('**Pipeline is already completed!**',flush=True) sys.exit(0) diff --git a/dammit/app.py b/dammit/app.py index e8fc3f39..20accce4 100644 --- a/dammit/app.py +++ b/dammit/app.py @@ -37,11 +37,11 @@ def __init__(self, arg_src=sys.argv[1:]): self.config_d.update(vars(self.args)) def run(self): - print(ui.header('dammit')) + print(ui.header('dammit'),flush=True) print(ui.header(__description__, level=2)) about = '\nby {0}\n\n**v{1}**, {2}\n'.format(', '.join(__authors__), __version__, __date__) - print(about) + print(about,flush=True) return self.args.func() def description(self): @@ -273,7 +273,7 @@ def handle_migrate(self): def handle_databases(self): log.start_logging() - print(ui.header('submodule: databases', level=2)) + print(ui.header('submodule: databases', level=2),flush=True) handler = databases.get_handler(self.config_d) if self.args.quick: @@ -293,7 +293,7 @@ def handle_databases(self): def handle_annotate(self): log.start_logging() - print(ui.header('submodule: annotate', level=2)) + print(ui.header('submodule: annotate', level=2),flush=True) db_handler = databases.get_handler(self.config_d) diff --git a/dammit/databases.py b/dammit/databases.py index ce12f47c..75b03253 100644 --- a/dammit/databases.py +++ b/dammit/databases.py @@ -76,25 +76,25 @@ def print_meta(handler): handler (handler.TaskHandler): The database task handler. 
''' - print(ui.header('Info', level=4)) + print(ui.header('Info', level=4),flush=True) info = {'Doit Database': handler.dep_file, 'Database Directory': handler.directory} - print(ui.listing(info)) + print(ui.listing(info),flush=True) def install(handler): '''Run the database prep pipeline from the given handler. ''' - print(ui.header('Database Install', level=3)) + print(ui.header('Database Install', level=3),flush=True) print_meta(handler) msg = '*All database tasks up-to-date.*' uptodate, statuses = handler.print_statuses(uptodate_msg=msg) if not uptodate: - print('Installing...') + print('Installing...',flush=True) return handler.run() else: - print('Nothing to install!') + print('Nothing to install!',flush=True) return 0 @@ -103,7 +103,7 @@ def check_or_fail(handler): with status 2. ''' - print(ui.header('Database Check', level=3)) + print(ui.header('Database Check', level=3),flush=True) print_meta(handler) msg = '*All database tasks up-to-date.*' uptodate, statuses = handler.print_statuses(uptodate_msg=msg) @@ -113,7 +113,7 @@ def check_or_fail(handler): ' already installed them, make sure you\'ve given' ' the correct location to `--database-dir` or have' ' exported the $DAMMIT_DB_DIR environment' - ' variable.')) + ' variable.'),flush=True) sys.exit(2) diff --git a/dammit/handler.py b/dammit/handler.py index 230599d8..3406e299 100644 --- a/dammit/handler.py +++ b/dammit/handler.py @@ -159,17 +159,17 @@ def print_statuses(self, uptodate_msg='All tasks up-to-date!', uptodate, statuses = self.check_uptodate() if uptodate: - print(ui.paragraph(uptodate_msg)) + print(ui.paragraph(uptodate_msg),flush=True) else: print(ui.paragraph(outofdate_msg)) uptodate_list = [t for t,s in statuses.items() if s is True] outofdate_list = [t for t,s in statuses.items() if s is False] if uptodate_list: - print('\nUp-to-date tasks:') - print(ui.listing(uptodate_list)) + print('\nUp-to-date tasks:',flush=True) + print(ui.listing(uptodate_list),flush=True) if outofdate_list: - 
print('\nOut-of-date tasks:') - print(ui.listing(outofdate_list)) + print('\nOut-of-date tasks:',flush=True) + print(ui.listing(outofdate_list),flush=True) return uptodate, statuses def check_uptodate(self): @@ -207,7 +207,7 @@ def run(self, doit_args=None, verbose=True): int: Exit status of the doit command. ''' if verbose: - print(ui.header('Run Tasks', level=4)) + print(ui.header('Run Tasks', level=4),flush=True) if doit_args is None: doit_args = ['run'] if self.n_threads > 1: diff --git a/dammit/log.py b/dammit/log.py index 098b918a..65b2b81a 100644 --- a/dammit/log.py +++ b/dammit/log.py @@ -35,7 +35,7 @@ def run(filename=None, test=False): filename = log_file if test is True: filename = os.path.join(log_dir, 'dammit-tests.log') - print('Logger in testing mode:', filename) + print('Logger in testing mode:', filename,flush=True) logging.basicConfig(level=logging.DEBUG, **config) run_handler = logging.FileHandler(filename) diff --git a/dammit/profile.py b/dammit/profile.py index 0d94f107..a93ebe65 100644 --- a/dammit/profile.py +++ b/dammit/profile.py @@ -44,7 +44,7 @@ def start_profiler(self, filename=None, blockname='__main__'): self.blockname = blockname self.running = True self.lock = filelock.FileLock('{0}.lock'.format(self.filename)) - print('Profiling is ON:', self.filename, '\n', file=sys.stderr) + print('Profiling is ON:', self.filename, '\n', file=sys.stderr,flush=True) def write_result(self, task_name, start_time, end_time, elapsed_time): '''Write results to the file, using the given task name as the diff --git a/dammit/utils.py b/dammit/utils.py index d0781283..21f92d2c 100644 --- a/dammit/utils.py +++ b/dammit/utils.py @@ -122,7 +122,7 @@ def __enter__(self): if self.verbose: print('Move to `{0}` from cwd: `{1}`'.format(self.target, self.cwd, - file=sys.stderr)) + file=sys.stderr),flush=True) if self.create: try: os.mkdir(self.target) From 8e7e9eb535a63660c555f26d96569a3382113ff9 Mon Sep 17 00:00:00 2001 From: kasutaja Date: Mon, 11 Feb 2019 
16:57:12 +0200 Subject: [PATCH 3/7] replaced OrthoDBv8 with OrthoDBv10 --- dammit/app.py | 9 +++++++++ dammit/config.json | 5 +++++ dammit/databases.json | 44 ++++++++++++++++++++++++++++++++++++------- dammit/databases.py | 39 ++++++++++++++++++++++---------------- dammit/tasks/shell.py | 22 ++++++++++++++++++++++ 5 files changed, 96 insertions(+), 23 deletions(-) diff --git a/dammit/app.py b/dammit/app.py index 20accce4..62a98538 100644 --- a/dammit/app.py +++ b/dammit/app.py @@ -91,6 +91,15 @@ def add_common_args(parser): ' Full list of options is below.' ) + parser.add_argument('--orthodb-group', + default='metazoa', + metavar='[metazoa, arthropoda, vertebrata, ...]', + choices=list(self.databases_d['OrthoDB'].keys()), + help='Which orthoDB group to use. Should be chosen'\ + ' based on the organism being annotated.'\ + ' Full list of options is below.' + ) + parser.add_argument('--n_threads', type=int, default=1, diff --git a/dammit/config.json b/dammit/config.json index aae66d54..cb182730 100644 --- a/dammit/config.json +++ b/dammit/config.json @@ -21,6 +21,11 @@ "output_suffix": ".busco.results", "params": [] }, + "orthodb": { + "db_dir": "orthoDBv10", + "output_suffix": ".ortho.results", + "params": [] + }, "hmmer": { "hmmscan": [], diff --git a/dammit/databases.json b/dammit/databases.json index 98b8a87a..de5a8798 100644 --- a/dammit/databases.json +++ b/dammit/databases.json @@ -48,13 +48,43 @@ }, "OrthoDB": { - "access": "download", - "db_type": "prot", - "filename": "odb10_arthropoda.fasta", - "md5": "65414490cfe2073f78d7757e3a34607e", - "fileformat": "gz", - "version": 10, - "url": "https://v100.orthodb.org/download/odb10_arthropoda_fasta.tar.gz" + "arthropoda":{ + "access": "download", + "db_type": "prot", + "folder": "odb10_arthropoda", + "md5": null, + "fileformat": "tar.gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10_arthropoda_fasta.tar.gz" + }, + "metazoa":{ + "access": "download", + "db_type": "prot", + "folder": 
"odb10_metazoa", + "md5": null, + "fileformat": "tar.gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10_metazoa_fasta.tar.gz" + }, + "vertebrata":{ + "access": "download", + "db_type": "prot", + "folder": "odb10_vertebrata", + "md5": null, + "fileformat": "tar.gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10_vertebrata_fasta.tar.gz" + }, + "protozoa":{ + "access": "download", + "db_type": "prot", + "folder": "odb10_protozoa", + "md5": null, + "fileformat": "tar.gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10_protozoa_fasta.tar.gz" + } + }, "orthodb_genes": { diff --git a/dammit/databases.py b/dammit/databases.py index 75b03253..5fb6f81f 100644 --- a/dammit/databases.py +++ b/dammit/databases.py @@ -18,6 +18,7 @@ from dammit.tasks.infernal import CMPressTask from dammit.tasks.shell import (get_download_and_gunzip_task, get_download_and_untar_task, + get_unexisting_folder_cat_task, get_download_task, get_gunzip_task) @@ -134,7 +135,7 @@ def build_default_pipeline(handler, config, databases, with_uniref=False, with_n register_pfam_tasks(handler, config['hmmer']['hmmpress'], databases) register_rfam_tasks(handler, config['infernal']['cmpress'], databases) - register_orthodb_tasks(handler, config['last']['lastdb'], databases) + register_orthodb_tasks(handler, config, databases) register_busco_tasks(handler, config, databases) register_sprot_tasks(handler, config['last']['lastdb'], databases) if with_uniref: @@ -193,26 +194,32 @@ def register_rfam_tasks(handler, params, databases): return handler -def register_orthodb_tasks(handler, params, databases): +def register_orthodb_tasks(handler, config, databases): + #Prepare variables + last_db_params = config['last']['lastdb'] orthodb = databases['OrthoDB'] - archive_fn = '{0}.{1}'.format(orthodb['filename'], - orthodb['fileformat']) - target_fn = path.join(handler.directory, orthodb['filename']) - - dl_task = get_download_task(orthodb['url'], - archive_fn, - 
md5=orthodb['md5']) - gz_task = get_gunzip_task(archive_fn, target_fn) - - handler.register_task('download:OrthoDB', dl_task, - files={'OrthoDB-gz': archive_fn}) - handler.register_task('gunzip:OrthoDB', gz_task, - files={'OrthoDB': target_fn}) + orthodb_dir = path.join(handler.directory, config['orthodb']['db_dir']) + group_name = config['orthodb-group'] #Hardcoded in config.json as app.py argument seems not to work + group = orthodb[group_name] + files = {'orthoDB-{0}'.format(group_name): path.join(orthodb_dir, group['folder'])} + + target_dir = orthodb_dir+"/"+group_name+"/Rawdata" + target_fn = orthodb_dir + "/"+ group_name+ "/"+ group_name+ ".fasta" + + dl_task = get_download_and_untar_task(group['url'], orthodb_dir, label=group_name) + + + cat_task = get_unexisting_folder_cat_task(target_dir,target_fn) + + + #Register tasks + handler.register_task('download_and_untar:OrthoDB', dl_task, files=files) + handler.register_task('join_files:OrthDB',cat_task, files={'OrthoDB': target_dir}) handler.register_task('lastdb:OrthoDB', LastDBTask().task(target_fn, target_fn, prot=True, - params=params)) + params=last_db_params)) return handler diff --git a/dammit/tasks/shell.py b/dammit/tasks/shell.py index 613dd39b..d52e9b4e 100644 --- a/dammit/tasks/shell.py +++ b/dammit/tasks/shell.py @@ -208,6 +208,28 @@ def get_cat_task(file_list, target_fn): 'targets': [target_fn], 'clean': [clean_targets]} +@doit_task +def get_unexisting_folder_cat_task(file_dir, target_fn): + '''Create a doit task to `cat` together all .fs files in folder + result to the given target. + + Args: + file_dir (str): The directory containing files to `cat`. + target_fn (str): The target file. + + Returns: + dict: A doit task. 
+ ''' + file_list = file_dir+"/*.fs" + + cmd = 'cat {files} > {t}'.format(files=file_list, t=target_fn) + + return {'name': 'cat:' + os.path.basename(target_fn), + 'actions': [cmd], + 'targets': [target_fn], + 'uptodate': [run_once], + 'clean': [clean_targets]} + @doit_task def get_copy_file_task(src, dst): From 332b9e93d2c42222bbb63d97e15ee873c937fbf7 Mon Sep 17 00:00:00 2001 From: kasutaja Date: Mon, 11 Feb 2019 18:03:02 +0200 Subject: [PATCH 4/7] updated version fixed orthoDB command line argument name --- LICENSE | 3 +++ dammit/VERSION | 2 +- dammit/databases.py | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index a822c4d8..481a6bc9 100644 --- a/LICENSE +++ b/LICENSE @@ -4,6 +4,9 @@ dammit -- a simple de novo transcriptome annotator Copyright (C) 2015-2018 Camille Scott +Copyright (C) 2019 Richard Meitern + + All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/dammit/VERSION b/dammit/VERSION index 9459d4ba..5625e59d 100644 --- a/dammit/VERSION +++ b/dammit/VERSION @@ -1 +1 @@ -1.1 +1.2 diff --git a/dammit/databases.py b/dammit/databases.py index 5fb6f81f..3c1b3b4e 100644 --- a/dammit/databases.py +++ b/dammit/databases.py @@ -199,7 +199,7 @@ def register_orthodb_tasks(handler, config, databases): last_db_params = config['last']['lastdb'] orthodb = databases['OrthoDB'] orthodb_dir = path.join(handler.directory, config['orthodb']['db_dir']) - group_name = config['orthodb-group'] #Hardcoded in config.json as app.py argument seems not to work + group_name = config['orthodb_group'] group = orthodb[group_name] files = {'orthoDB-{0}'.format(group_name): path.join(orthodb_dir, group['folder'])} @@ -215,6 +215,7 @@ def register_orthodb_tasks(handler, config, databases): #Register tasks handler.register_task('download_and_untar:OrthoDB', dl_task, files=files) 
handler.register_task('join_files:OrthDB',cat_task, files={'OrthoDB': target_dir}) + #TODO remove target_dir handler.register_task('lastdb:OrthoDB', LastDBTask().task(target_fn, target_fn, From 8c16212e23041e7e6ec6fa50c8390d99bd81f64f Mon Sep 17 00:00:00 2001 From: kasutaja Date: Thu, 14 Feb 2019 09:56:41 +0200 Subject: [PATCH 5/7] added TODOs changed folder locations --- dammit/databases.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dammit/databases.py b/dammit/databases.py index 3c1b3b4e..87d72c70 100644 --- a/dammit/databases.py +++ b/dammit/databases.py @@ -195,6 +195,7 @@ def register_rfam_tasks(handler, params, databases): def register_orthodb_tasks(handler, config, databases): + #TODO get OrthoDBversion specific #Prepare variables last_db_params = config['last']['lastdb'] orthodb = databases['OrthoDB'] @@ -203,19 +204,19 @@ def register_orthodb_tasks(handler, config, databases): group = orthodb[group_name] files = {'orthoDB-{0}'.format(group_name): path.join(orthodb_dir, group['folder'])} - target_dir = orthodb_dir+"/"+group_name+"/Rawdata" - target_fn = orthodb_dir + "/"+ group_name+ "/"+ group_name+ ".fasta" + target_dir = orthodb_dir + "/"+ group_name + "/"+ group_name + target_fn = orthodb_dir + "/"+ group_name+ "/"+ group_name dl_task = get_download_and_untar_task(group['url'], orthodb_dir, label=group_name) - - cat_task = get_unexisting_folder_cat_task(target_dir,target_fn) + + cat_task = get_unexisting_folder_cat_task(orthodb_dir + "/"+ group_name + '/Rawdata',target_fn) #Register tasks handler.register_task('download_and_untar:OrthoDB', dl_task, files=files) handler.register_task('join_files:OrthDB',cat_task, files={'OrthoDB': target_dir}) - #TODO remove target_dir + #TODO remove target_dir/Rawdata handler.register_task('lastdb:OrthoDB', LastDBTask().task(target_fn, target_fn, From 158104ab22b9fd65d677f56bac955c2502643069 Mon Sep 17 00:00:00 2001 From: kasutaja Date: Thu, 14 Feb 2019 12:27:06 +0200 Subject: [PATCH 
6/7] added missing databases to map odbGenes to odbIDs --- dammit/databases.json | 52 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/dammit/databases.json b/dammit/databases.json index de5a8798..36418200 100644 --- a/dammit/databases.json +++ b/dammit/databases.json @@ -48,6 +48,7 @@ }, "OrthoDB": { + "arthropoda":{ "access": "download", "db_type": "prot", @@ -83,19 +84,50 @@ "fileformat": "tar.gz", "version": 10, "url": "https://v100.orthodb.org/download/odb10_protozoa_fasta.tar.gz" - } + }, + + "genes": { + "access": "download", + "db_type": "txt", + "filename": "odb10v0_genes.tab", + "md5": "34b023f5334b124ed846ae526fbac8fc", + "fileformat": "gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10v0_genes.tab.gz" + }, + + "og2genes": { + "access": "download", + "db_type": "txt", + "filename": "odb10v0_OG2genes.tab", + "md5": "d16c8a2272d8e35684279c4d6e0ed1a0", + "fileformat": "gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10v0_OG2genes.tab.gz" + }, + "og2xrefs": { + "access": "download", + "db_type": "txt", + "filename": "odb10v0_OG_xrefs.tab", + "md5": "6d1279b4e4819b5d95c2e0ec81a53417", + "fileformat": "gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10v0_OG_xrefs.tab.gz" + }, + + "ogs": { + "access": "download", + "db_type": "txt", + "filename": "odb10v0_OGs.tab", + "md5": "b9d6cf78cfa401f2ee8c99627b95a8af", + "fileformat": "gz", + "version": 10, + "url": "https://v100.orthodb.org/download/odb10v0_OGs.tab.gz" + } }, - "orthodb_genes": { - "access": "download", - "db_type": "txt", - "filename": "odb10v0_genes.tab", - "md5": "34b023f5334b124ed846ae526fbac8fc", - "fileformat": "gz", - "version": 10, - "url": "https://v100.orthodb.org/download/odb10v0_genes.tab.gz" - }, + "BUSCO": { "deltaepsilonsub": { From fba935422cb0a95a8d3de25265a2f59712833725 Mon Sep 17 00:00:00 2001 From: Richard Meitern Date: Wed, 24 Apr 2019 15:47:13 +0300 Subject: 
[PATCH 7/7] vertebrate DB install fails, unexpected dir name --- dammit/databases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dammit/databases.py b/dammit/databases.py index 87d72c70..88945b00 100644 --- a/dammit/databases.py +++ b/dammit/databases.py @@ -209,7 +209,7 @@ def register_orthodb_tasks(handler, config, databases): dl_task = get_download_and_untar_task(group['url'], orthodb_dir, label=group_name) - + #TODO take the folder name from json not from the untar task as the folder that is untar'ed might have a different name (problem with vertebrata/e eg) cat_task = get_unexisting_folder_cat_task(orthodb_dir + "/"+ group_name + '/Rawdata',target_fn)