diff --git a/.envrc b/.envrc
index 740d348d..a6e8f579 100644
--- a/.envrc
+++ b/.envrc
@@ -1,6 +1,7 @@
 export FLASK_APP=main:app
 export GAE_VERSION=development-`cat .travis.yml | grep 'VERSION_NUM=' | cut -f 2 -d '='`
-export GOOGLE_APPLICATION_CREDENTIALS=env_config/client-secret.json
+export CLOUD_CONFIG=1
+export GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/env_config/client-secret.json
 export PYTHONPATH=$(pwd)
 export WERKZEUG_DEBUG_PIN=off
 export TEMPLATES_AUTO_RELOAD=1
diff --git a/.gitignore b/.gitignore
index 533f72d4..7490bb8e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,7 +61,7 @@ target/
 
 #Random
 *.DS_Store
-*.json
+package-lock.json
 tmp/
 
@@ -72,6 +72,10 @@ mapping_worker/.envrc
 env_config/
 env_config.zip
+env_config_mti/
+env_config_mti.zip
+env_config_al/
+env_config_al.zip
 
 .python-version
@@ -92,4 +96,10 @@ photos/*
 *.done
 
 # Heritability run go tool
-invoke
\ No newline at end of file
+invoke
+
+.vscode/launch.json
+cloud_functions/heritability_run/strain_data.tsv
+base/bam_bai_signed_download_script.sh
+
+uploads/
diff --git a/.gitmodules b/.gitmodules
index e69de29b..484b7775 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,18 @@
+[submodule "external/generate-thumbnails"]
+	path = external/generate-thumbnails
+	url = https://github.com/AndersenLab/generate-thumbnails
+[submodule "external/h2calc"]
+	path = external/h2calc
+	url = https://github.com/AndersenLab/h2calc
+[submodule "external/nscalc"]
+	path = external/nscalc
+	url = https://github.com/AndersenLab/nscalc
+[submodule "external/ipcalc"]
+	path = external/ipcalc
+	url = https://github.com/AndersenLab/ipcalc
+[submodule "external/dockerfile"]
+	path = external/dockerfile
+	url = https://github.com/AndersenLab/dockerfile
+[submodule "external/NemaScan"]
+	path = external/NemaScan
+	url = https://github.com/AndersenLab/NemaScan
diff --git a/.travis-al.yml b/.travis-al.yml
new file mode 100644
index 00000000..56497392
--- /dev/null
+++ b/.travis-al.yml
@@ -0,0 +1,29 @@
+language: bash
+
+env:
+  - GOOGLE_CLOUD_BUCKET="elegansvariation.org" GOOGLE_CLOUD_PROJECT_ID="andersen-lab"
+
+install:
+- openssl aes-256-cbc -K $encrypted_f7a2e30d9b29_key -iv $encrypted_f7a2e30d9b29_iv -in env_config_al.zip.enc -out env_config.zip -d
+- unzip -qo env_config.zip
+- mv env_config_al env_config
+- export VERSION_NUM=1-5-9
+- export APP_CONFIG=master
+- export CLOUD_CONFIG=1
+- if [ "${TRAVIS_BRANCH}" != "master" ]; then export APP_CONFIG=development; fi;
+- export GAE_VERSION=${APP_CONFIG}-${VERSION_NUM}
+- export GOOGLE_APPLICATION_CREDENTIALS=env_config/client-secret.json
+- export GOOGLE_CLOUD_BUCKET=${GOOGLE_CLOUD_BUCKET}
+- export GOOGLE_CLOUD_PROJECT_ID=${GOOGLE_CLOUD_PROJECT_ID}
+
+deploy:
+  provider: gae
+  version: "${GAE_VERSION}"
+  project: "${GOOGLE_CLOUD_PROJECT_ID}"
+  keyfile: env_config/client-secret.json
+  on:
+    all_branches: true
+  no_promote: true
+  no_stop_previous_version: true
+  skip_cleanup: true
diff --git a/.travis-mti.yml b/.travis-mti.yml
new file mode 100644
index 00000000..5481654f
--- /dev/null
+++ b/.travis-mti.yml
@@ -0,0 +1,28 @@
+language: bash
+
+env:
+  - GOOGLE_CLOUD_BUCKET="elegansvariation" GOOGLE_CLOUD_PROJECT_ID="andersen-lab-302418"
+
+install:
+- openssl aes-256-cbc -K $encrypted_eb81f51f2e9b_key -iv $encrypted_eb81f51f2e9b_iv -in env_config_mti.zip.enc -out env_config.zip -d
+- unzip -qo env_config.zip
+- mv env_config_mti env_config
+- export VERSION_NUM=1-5-9
+- export APP_CONFIG=master
+- export CLOUD_CONFIG=1
+- if [ "${TRAVIS_BRANCH}" != "master" ]; then export APP_CONFIG=development; fi;
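+# Non-master branches always fall back to APP_CONFIG=development, so the
+# deployed GAE version string becomes e.g. development-1-5-9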
+- export GAE_VERSION=${APP_CONFIG}-${VERSION_NUM}
+- export GOOGLE_APPLICATION_CREDENTIALS=env_config/client-secret.json
+- export GOOGLE_CLOUD_BUCKET=${GOOGLE_CLOUD_BUCKET}
+- export GOOGLE_CLOUD_PROJECT_ID=${GOOGLE_CLOUD_PROJECT_ID}
+
+deploy:
+  provider: gae
+  version: "${GAE_VERSION}"
+  project: "${GOOGLE_CLOUD_PROJECT_ID}"
+  keyfile: env_config/client-secret.json
+  on:
+    all_branches: true
+  no_promote: true
+  no_stop_previous_version: true
+  skip_cleanup: true
diff --git a/.travis.yml b/.travis.yml
index 9d3e3e97..817b1625 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,18 +1,24 @@
 language: bash
 
+env:
+  - GOOGLE_CLOUD_BUCKET="elegansvariation.org" GOOGLE_CLOUD_PROJECT_ID="andersen-lab"
+
 install:
-- openssl aes-256-cbc -K $encrypted_53077b9a3e95_key -iv $encrypted_53077b9a3e95_iv -in env_config.zip.enc -out env_config.zip -d
+- openssl aes-256-cbc -K $encrypted_86f5a1ab1ccf_key -iv $encrypted_86f5a1ab1ccf_iv -in env_config.zip.enc -out env_config.zip -d
 - unzip -qo env_config.zip
-- export VERSION_NUM=1-5-3
-- export APP_CONFIG=master
+- export VERSION_NUM=9-9-9-9
+- export APP_CONFIG=development
+- export CLOUD_CONFIG=1
 - if [ "${TRAVIS_BRANCH}" != "master" ]; then export APP_CONFIG=development; fi;
 - export GAE_VERSION=${APP_CONFIG}-${VERSION_NUM}
 - export GOOGLE_APPLICATION_CREDENTIALS=env_config/client-secret.json
+- export GOOGLE_CLOUD_BUCKET=${GOOGLE_CLOUD_BUCKET}
+- export GOOGLE_CLOUD_PROJECT_ID=${GOOGLE_CLOUD_PROJECT_ID}
 
 deploy:
   provider: gae
   version: "${GAE_VERSION}"
-  project: andersen-lab
+  project: "${GOOGLE_CLOUD_PROJECT_ID}"
   keyfile: env_config/client-secret.json
   on:
     all_branches: true
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3a474210..3e6e89b0 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,6 +1,6 @@
 ## Making Changes
 
-* Propose an issue or change you would like ot make using the issue tracker.
+* Propose an issue or change you would like to make using the issue tracker.
 * Create a fork of the development branch. Once you have completed your work, create a pull request.
 * Rebase the latest development branch changes if updates were made in the interim.
-* email danielecook@gmail.com if you have further questions.
+* Email danielecook@gmail.com or sam.wachspress@gmail.com if you have further questions.
diff --git a/Dockerfile b/Dockerfile
index 3312cf15..0a52cb5e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,6 +16,11 @@
 tabix \
 graphviz \
 libgraphviz-dev \
 pkg-config \
+libxml2 \
+xmlsec1 \
+libxml2-dev \
+libxmlsec1-dev \
+libxmlsec1-openssl \
 && rm -rf /var/lib/apt/lists/*
 
 ENV BCFTOOLS_BIN="bcftools-1.10.tar.bz2" \
@@ -56,6 +61,6 @@ ADD . /app
 RUN FLASK_APP=main:app GAE_VERSION=blank-blank flask
 
 # Download the database; GAE_VERSION set as dummy variable
-RUN FLASK_APP=main:app GAE_VERSION=blank-blank flask download_db
+# RUN FLASK_APP=main:app GAE_VERSION=blank-blank flask download_db
 
 CMD gunicorn -b :$PORT main:app
\ No newline at end of file
diff --git a/app.yaml b/app.yaml
index 8add486c..688f422d 100644
--- a/app.yaml
+++ b/app.yaml
@@ -1,6 +1,10 @@
+beta_settings:
+  cloud_sql_instances: andersen-lab:us-central1:cendr
+
 runtime: custom
+service: dev
 env: flex
-entrypoint: gunicorn -b :$PORT main:app
+entrypoint: gunicorn -b :$PORT main:app --ssl-version TLSv1_2
 
 runtime_config:
     python_version: 3
@@ -32,15 +36,15 @@ resources:
 
 liveness_check:
   path: "/liveness_check"
-  check_interval_sec: 30
-  timeout_sec: 4
+  check_interval_sec: 60
+  timeout_sec: 10
   failure_threshold: 2
   success_threshold: 2
 
 readiness_check:
   path: "/readiness_check"
-  check_interval_sec: 5
-  timeout_sec: 4
+  check_interval_sec: 120
+  timeout_sec: 10
   failure_threshold: 2
   success_threshold: 2
   app_start_timeout_sec: 300
\ No newline at end of file
diff --git a/base/application.py b/base/application.py
index 5de26672..84db0220 100644
--- a/base/application.py
+++ b/base/application.py
@@ -1,31 +1,40 @@
+from datetime import datetime
 import os
 import json
 import requests
+
 from os.path import basename
-from base.config import config
 from flask import Flask, render_template
 from flask_wtf.csrf import CSRFProtect
-from base.utils.text_utils import render_markdown
 from werkzeug.middleware.proxy_fix import ProxyFix
 from werkzeug.exceptions import HTTPException
+
+from base.constants import GOOGLE_CLOUD_BUCKET
+from base.config import config
+from base.utils.text_utils import render_markdown
 from base.manage import (initdb,
                          update_strains,
                          update_credentials,
-                         decrypt_credentials,
-                         download_db)
+                         decrypt_credentials)
 
 # --------- #
 #  Routing  #
 # --------- #
 from base.views.about import about_bp
 from base.views.primary import primary_bp
-from base.views.strains import strain_bp
+from base.views.strains import strains_bp
 from base.views.order import order_bp
 from base.views.data import data_bp
 from base.views.mapping import mapping_bp
 from base.views.gene import gene_bp
 from base.views.user import user_bp
+from base.views.maintenance import maintenance_bp
+from base.views.admin.admin import admin_bp
+from base.views.admin.users import users_bp
+from base.views.admin.data import data_admin_bp
+
 
 # Tools
 from base.views.tools import (tools_bp,
@@ -42,9 +51,9 @@ from base.views.api.api_data import api_data_bp
 
 # Auth
-from base.auth import (auth_bp,
-                       google_bp,
-                       github_bp)
+from base.views.auth import (auth_bp,
+                             google_bp,
+                             saml_bp)
 
 # ---- End Routing ---- #
@@ -54,7 +63,8 @@
                              cache,
                              debug_toolbar,
                              sslify,
-                             sqlalchemy)
+                             sqlalchemy,
+                             jwt)
 
 # Template filters
 from base.filters import (comma, format_release)
@@ -86,7 +96,7 @@ def configure_ssl(app):
         # Running on server
         app.debug = False
         # Ignore leading slash of urls; skips must use start of path
-        sslify(app)
+        sslify(app, skips=['tasks'])
     elif app.config['DEBUG']:
         debug_toolbar(app)
         app.config['PRESERVE_CONTEXT_ON_EXCEPTION'] = True
@@ -97,8 +107,7 @@ def register_commands(app):
     for command in [initdb,
                     update_strains,
                     update_credentials,
-                    decrypt_credentials,
-                    download_db]:
+                    decrypt_credentials]:
         app.cli.add_command(command)
@@ -110,20 +119,26 @@ def register_template_filters(app):
 
 
 def register_extensions(app):
     markdown(app)
     cache.init_app(app, config={'CACHE_TYPE': 'base.utils.cache.datastore_cache'})
-    sqlalchemy(app)
-    CSRFProtect(app)
-    app.config['csrf'] = CSRFProtect(app)
-
+    sqlalchemy.init_app(app)
+    # protect all routes (except the ones listed) from cross-site request forgery
+    csrf = CSRFProtect(app)
+    csrf.exempt(auth_bp)
+    csrf.exempt(saml_bp)
+    csrf.exempt(maintenance_bp)
+    app.config['csrf'] = csrf
+    jwt.init_app(app)
 
 
 def register_blueprints(app):
     """Register blueprints with the Flask application."""
     app.register_blueprint(primary_bp, url_prefix='')
     app.register_blueprint(about_bp, url_prefix='/about')
-    app.register_blueprint(strain_bp, url_prefix='/strain')
+    app.register_blueprint(strains_bp, url_prefix='/strains')
     app.register_blueprint(order_bp, url_prefix='/order')
     app.register_blueprint(data_bp, url_prefix='/data')
     app.register_blueprint(mapping_bp, url_prefix='')
     app.register_blueprint(gene_bp, url_prefix='/gene')
+
+    # User
     app.register_blueprint(user_bp, url_prefix='/user')
 
     # Tools
@@ -138,33 +153,51 @@ def register_blueprints(app):
     app.register_blueprint(api_data_bp, url_prefix='/api')
 
     # Auth
-    app.register_blueprint(auth_bp, url_prefix='')
+    app.register_blueprint(auth_bp, url_prefix='/auth')
+    app.register_blueprint(saml_bp, url_prefix='/saml')
     app.register_blueprint(google_bp, url_prefix='/login')
-    app.register_blueprint(github_bp, url_prefix='/login')
 
-    # Healthchecks
+    # Admin
+    app.register_blueprint(admin_bp, url_prefix='/admin')
+    app.register_blueprint(users_bp, url_prefix='/admin/users')
+    app.register_blueprint(data_admin_bp, url_prefix='/admin/data')
+
+    # Healthchecks/Maintenance
+    app.register_blueprint(maintenance_bp, url_prefix='/tasks')
     app.register_blueprint(check_bp, url_prefix='')
 
 
 def gs_static(url, prefix='static'):
-    return f"https://storage.googleapis.com/elegansvariation.org/{prefix}/{url}"
+    return f"https://storage.googleapis.com/{GOOGLE_CLOUD_BUCKET}/{prefix}/{url}"
 
 
 def configure_jinja(app):
     # Injects "contexts" into templates
     @app.context_processor
     def inject():
-        return dict(version=os.environ.get("GAE_VERSION", "-9-9-9").split("-", 1)[1].replace("-", "."),
-                    json=json,
-                    list=list,
-                    str=str,
-                    int=int,
-                    len=len,
-                    gs_static=gs_static,
-                    basename=basename,
-                    render_markdown=render_markdown)
-
-
+        return dict(version=os.environ.get("GAE_VERSION", "-9-9-9").split("-", 1)[1].replace("-", "."),
+                    json=json,
+                    list=list,
+                    str=str,
+                    int=int,
+                    len=len,
+                    gs_static=gs_static,
+                    basename=basename,
+                    render_markdown=render_markdown)
+
+    # Datetime filters for Jinja
+    @app.template_filter('date_format')
+    def _jinja2_filter_datetime(date, fmt=None):
+        if fmt:
+            return date.strftime(fmt)
+        else:
+            return date.strftime('%c')
+
+# Example 'date_format' input: 2021-04-14 17:26:51.348674+00:00
+# with the format string '%Y-%m-%d %H:%M:%S.%f+%z'
 
 def register_errorhandlers(app):
 
     def render_error(e="generic"):
diff --git a/base/auth.py b/base/auth.py
deleted file mode 100644
index d05f09c5..00000000
--- a/base/auth.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import arrow
-import os
-from flask import (redirect,
-                   render_template,
-                   url_for,
-                   session,
-                   request,
-                   flash)
-from functools import wraps
-from base.models import user_ds
-from base.utils.data_utils import unique_id
-from slugify import slugify
-from logzero import logger
-
-from flask_dance.contrib.google import make_google_blueprint, google
-from flask_dance.contrib.github import make_github_blueprint, github
-from flask_dance.consumer import oauth_authorized
-
-from flask import Blueprint
-auth_bp = Blueprint('auth',
-                    __name__,
-                    template_folder='')
-
-google_bp = make_google_blueprint(scope=["https://www.googleapis.com/auth/userinfo.profile",
-                                         "https://www.googleapis.com/auth/userinfo.email"],
-                                  offline=True)
-github_bp = make_github_blueprint(scope="user:email")
-# dropbox_bp = make_dropbox_blueprint()
-
-
-@auth_bp.route("/login/select", methods=['GET'])
-def choose_login(error=None):
-    # Relax scope for Google
-    if not session.get("login_referrer", "").endswith("/login/select"):
-        session["login_referrer"] = request.referrer
-    os.environ['OAUTHLIB_RELAX_TOKEN_SCOPE'] = "true"
-    VARS = {'page_title': 'Choose Login'}
-    if error:
-        flash(error, 'danger')
-    return render_template('login.html', **VARS)
-
-
-@oauth_authorized.connect
-def authorized(blueprint, token):
-    if google.authorized:
-        user_info = google.get("/oauth2/v2/userinfo")
-        assert user_info.ok
-        user_info = {'google': user_info.json()}
-        user_email = user_info['google']['email'].lower()
-    elif github.authorized:
-        user_emails = github.get("/user/emails")
-        user_email = [x for x in user_emails.json() if x['primary']][0]["email"].lower()
-        user_info = {'github': github.get('/user').json()}
-        user_info['github']['email'] = user_email
-    else:
-        flash("Error logging in!")
-        return redirect(url_for("auth.choose_login"))
-
-    # Create or get existing user.
-    user = user_ds(user_email)
-    if not user._exists:
-        user.user_email = user_email
-        user.user_info = user_info
-        user.email_confirmation_code = unique_id()
-        user.user_id = unique_id()[0:8]
-        user.username = slugify("{}_{}".format(user_email.split("@")[0], unique_id()[0:4]))
-
-    user.last_login = arrow.utcnow().datetime
-    user.save()
-
-    session['user'] = user.to_dict()
-    logger.debug(session)
-
-    flash("Successfully logged in!", 'success')
-    return redirect(session.get("login_referrer", url_for('primary.primary')))
-
-
-def login_required(f):
-    @wraps(f)
-    def func(*args, **kwargs):
-        if not session.get('user'):
-            logger.info(session)
-            with app.app_context():
-                session['redirect_url'] = request.url
-            return redirect(url_for('auth.choose_login'))
-        return f(*args, **kwargs)
-    return func
-
-
-@auth_bp.route('/logout')
-def logout():
-    """
-    Logs the user out.
- """ - session.clear() - flash("Successfully logged out", "success") - return redirect(request.referrer) diff --git a/base/cloud_config.py b/base/cloud_config.py new file mode 100644 index 00000000..5998467d --- /dev/null +++ b/base/cloud_config.py @@ -0,0 +1,203 @@ +# Application Cloud Configuration for Site Static Content hosted externally +import os +import shutil +import json + +from os import path +from logzero import logger +from google.oauth2 import service_account +from google.cloud import datastore, storage + +from base.constants import REPORT_V1_FILE_LIST, REPORT_V2_FILE_LIST, GOOGLE_CLOUD_BUCKET +from base.utils.data_utils import dump_json, unique_id + +class CloudConfig: + + ds_client = None + storage_client = None + kind = 'cloud-config' + default_cc = { 'releases' : [{'dataset': '20210121', 'wormbase': 'WS276', 'version': 'v2'}, + {'dataset': '20200815', 'wormbase': 'WS276', 'version': 'v2'}, + {'dataset': '20180527', 'wormbase': 'WS263', 'version': 'v1'}, + {'dataset': '20170531', 'wormbase': 'WS258', 'version': 'v1'}, + {'dataset': '20160408', 'wormbase': 'WS245', 'version': 'v1'}] } + + def __init__(self, name, cc=default_cc, kind_prefix='', local=True): + self.kind = '{}{}'.format(kind_prefix, self.kind) + self.name = name + self.filename = f"{name}.txt" + self.cc = cc + self.local = local + + def get_ds_client(self): + if not self.ds_client: + self.ds_client = datastore.Client(credentials=service_account.Credentials.from_service_account_file('env_config/client-secret.json')) + return self.ds_client + + def get_storage_client(self): + if not self.storage_client: + self.storage_client = storage.Client(credentials=service_account.Credentials.from_service_account_file('env_config/client-secret.json')) + return self.storage_client + + def download_file(self, name, fname): + client = self.get_storage_client() + bucket = client.get_bucket(GOOGLE_CLOUD_BUCKET) + blob = bucket.blob(name) + blob.download_to_file(open(fname, 'wb')) + + def ds_save(self): + data = {'cloud_config': self.cc} + m = datastore.Entity(key=self.get_ds_client().key(self.kind, self.name)) + for key, value in data.items(): + if isinstance(value, dict): + m[key] = 'JSON:' + dump_json(value) + else: + m[key] = value + logger.debug(f"store: {self.kind} - {self.name}") + self.get_ds_client().put(m) + + def ds_load(self): + """ Retrieves a cloud config object from datastore """ + result = self.get_ds_client().get(self.get_ds_client().key(self.kind, self.name)) + logger.debug(f"get: {self.kind} - {self.name}") + try: + result_out = {'_exists': True} + for k, v in result.items(): + if isinstance(v, str) and v.startswith("JSON:"): + result_out[k] = json.loads(v[5:]) + elif v: + result_out[k] = v + self.cc = result_out.get('cloud_config') + except AttributeError: + return None + + def file_load(self): + """ Retrieves a cloud config object from a local file """ + if path.exists(self.filename): + with open(self.filename) as json_file: + data = json.load(json_file) + cc = data.get('cloud_config') if data else None + self.cc = cc + + def file_save(self): + """ Saves a cloud config object to a local file """ + with open(self.filename, 'w') as outfile: + data = {'cloud_config': self.cc} + json.dump(data, outfile) + + def save(self): + if self.local: + self.file_save() + else: + self.ds_save() + + def load(self): + if self.local: + self.file_load() + else: + self.ds_load() + + def remove_release(self, dataset): + ''' Removes a data release from the cloud config object ''' + releases = self.cc['releases'] + for i, r in 
+    def remove_release(self, dataset):
+        ''' Removes a data release from the cloud config object '''
+        # rebuild the list rather than deleting entries while iterating
+        self.cc['releases'] = [r for r in self.cc['releases'] if r['dataset'] != dataset]
+        self.save()
+
+    def remove_release_files(self, dataset):
+        ''' Removes files linked to a data release from the GAE server '''
+        report_path = f"base/static/reports/{dataset}"
+        if os.path.exists(report_path):
+            shutil.rmtree(report_path)
+
+    def remove_release_db(self, dataset, wormbase):
+        ''' Removes sqlite db linked to a data release from the GAE server '''
+        db_path = f"base/cendr.{dataset}.{wormbase}.db"
+        os.remove(db_path)
+
+    def add_release(self, dataset, wormbase, version):
+        ''' Adds a data release to the cloud config object '''
+        # remove dataset if there is an existing one in the config
+        releases = [r for r in self.cc['releases'] if r['dataset'] != dataset]
+        releases = [{'dataset': dataset, 'wormbase': wormbase, 'version': version}] + releases
+        self.cc['releases'] = releases
+        self.save()
+
+    def get_release_files(self, dataset, files, refresh=False):
+        ''' Downloads files linked to a data release from the cloud bucket to the GAE server '''
+        local_path = 'base/static/reports/{}'.format(dataset)
+        if os.path.exists(local_path):
+            if refresh:
+                shutil.rmtree(local_path)
+            else:
+                return
+
+        os.makedirs(local_path)
+        try:
+            for n in files:
+                name = f"data_reports/{dataset}/{n}"
+                fname = f"{local_path}/{n}"
+                self.download_file(name=name, fname=fname)
+        except Exception:
+            return None
+        return files
+
+    def get_release_db(self, dataset, wormbase, refresh=False):
+        db_name = f"db/cendr.{dataset}.{wormbase}.db"
+        db_fname = f"base/cendr.{dataset}.{wormbase}.db"
+        if os.path.exists(db_fname):
+            if refresh:
+                os.remove(db_fname)
+            else:
+                return
+
+        self.download_file(name=db_name, fname=db_fname)
+        return True
+
+    def create_backup(self):
+        name = self.name
+        self.name = '{}_{}'.format(name, unique_id())
+        self.save()
+        self.name = name
+
+    def get_properties(self):
+        ''' Converts the cloud_config object into a format that matches the regular config object '''
+        releases = self.cc['releases']
+        RELEASES = []
+        for r in releases:
+            RELEASES.append((r['dataset'], r['wormbase']))
+        RELEASES.sort(reverse=True)
+
+        # Set the most recent release
+        DATASET_RELEASE, WORMBASE_VERSION = RELEASES[0]
+
+        return {'DATASET_RELEASE': DATASET_RELEASE,
+                'WORMBASE_VERSION': WORMBASE_VERSION,
+                'RELEASES': RELEASES}
+
+    def get_external_content(self):
+        releases = self.cc['releases']
+        current_release = releases[0]
+
+        # get data reports
+        for r in releases:
+            files = []
+            if r['version'] == 'v1':
+                files = REPORT_V1_FILE_LIST
+            elif r['version'] == 'v2':
+                files = REPORT_V2_FILE_LIST
+            self.get_release_files(r['dataset'], files, refresh=False)
+
+        # get sqlite db
+        self.get_release_db(current_release['dataset'], current_release['wormbase'], refresh=False)
diff --git a/base/config.py b/base/config.py
index 5c513097..c9e82dac 100644
--- a/base/config.py
+++ b/base/config.py
@@ -1,57 +1,79 @@
 # Application Configuration
 import os
 import yaml
+
+from logzero import logger
 from base.utils.data_utils import json_encoder
+from base.constants import DEFAULT_CLOUD_CONFIG
+from base.cloud_config import CloudConfig
+
+# Whether or not to load config properties from cloud datastore
+# (environment variables are strings, so cast to int before comparing)
+try:
+  CLOUD_CONFIG = int(os.environ['CLOUD_CONFIG'])
+except (KeyError, ValueError):
+  CLOUD_CONFIG = 0
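+
+# CLOUD_CONFIG=1 is exported by .envrc and the Travis configs; get_config()
+# below uses it to choose between the local-file and datastore-backed
+# CloudConfig source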
 
 # CeNDR Version
 APP_CONFIG, CENDR_VERSION = os.environ['GAE_VERSION'].split("-", 1)
 if APP_CONFIG not in ['development', 'master']:
-    APP_CONFIG = 'development'
+  APP_CONFIG = 'development'
 CENDR_VERSION = CENDR_VERSION.replace("-", '.')
 
 # BUILDS AND RELEASES
 # The first release is the current release
 # (RELEASE, ANNOTATION_GENOME)
-RELEASES = [("20200815", "WS276"),
-            ("20180527", "WS263"),
-            ("20170531", "WS258"),
-            ("20160408", "WS245")]
+RELEASES = [("20210121", "WS276"), ("20200815", "WS276"), ("20180527", "WS263"), ("20170531", "WS258"), ("20160408", "WS245")]
 
 # The most recent release
 DATASET_RELEASE, WORMBASE_VERSION = RELEASES[0]
 
-# SQLITE DATABASE
-SQLITE_PATH = f"base/cendr.{DATASET_RELEASE}.{WORMBASE_VERSION}.db"
-
-
 def load_yaml(path):
-    return yaml.load(open(path), Loader=yaml.SafeLoader)
-
+  return yaml.load(open(path), Loader=yaml.SafeLoader)
 
 # CONFIG
 def get_config(APP_CONFIG):
-    """Load all configuration information including
-    constants defined above.
-
-    (BASE_VARS are the same regardless of whether we are debugging or in production)
-    """
-    config = dict()
-    BASE_VARS = load_yaml("env_config/base.yaml")
-    APP_CONFIG_VARS = load_yaml(f"env_config/{APP_CONFIG}.yaml")
-    config.update(BASE_VARS)
-    config.update(APP_CONFIG_VARS)
-    # Add configuration variables
-    # Remove base prefix for SQLAlchemy as it is loaded
-    # from application folder
-    config["SQLALCHEMY_DATABASE_URI"] = f"sqlite:///{SQLITE_PATH}".replace("base/", "")
-    config['json_encoder'] = json_encoder
-    config.update({"CENDR_VERSION": CENDR_VERSION,
-                   "APP_CONFIG": APP_CONFIG,
-                   "DATASET_RELEASE": DATASET_RELEASE,
-                   "WORMBASE_VERSION": WORMBASE_VERSION,
-                   "RELEASES": RELEASES})
-    return config
-
-
-# Generate the configuration
+  """Load all configuration information including
+  constants defined above.
+
+  (BASE_VARS are the same regardless of whether we are debugging or in production)
+  """
+  config = dict()
+  BASE_VARS = load_yaml("env_config/base.yaml")
+  APP_CONFIG_VARS = load_yaml(f"env_config/{APP_CONFIG}.yaml")
+
+  logger.info(f'APP_CONFIG: {APP_CONFIG}')
+  DB_USER = APP_CONFIG_VARS['PSQL_DB_USERNAME']
+  DB_PASS = APP_CONFIG_VARS['PSQL_DB_PASSWORD']
+  CONNECTION = APP_CONFIG_VARS['PSQL_DB_CONNECTION_NAME']
+  DB = APP_CONFIG_VARS['PSQL_DB_NAME']
+
+  config.update(BASE_VARS)
+  config.update(APP_CONFIG_VARS)
+
+  config['json_encoder'] = json_encoder
+  config.update({"CENDR_VERSION": CENDR_VERSION,
+                 "APP_CONFIG": APP_CONFIG,
+                 "DATASET_RELEASE": DATASET_RELEASE,
+                 "WORMBASE_VERSION": WORMBASE_VERSION,
+                 "RELEASES": RELEASES})
+
+  config['DS_PREFIX'] = ''
+  if APP_CONFIG == 'development':
+    config['DS_PREFIX'] = 'DEV_'
+
+  # Add configuration variables from the cloud config
+  local = (CLOUD_CONFIG == 1)
+  cc = CloudConfig(DEFAULT_CLOUD_CONFIG, kind_prefix=config['DS_PREFIX'], local=local)
+  cc.load()
+  cc.get_external_content()
+  props = cc.get_properties()
+  config.update(props)
+  config['cloud_config'] = cc
+
+  config['SQLALCHEMY_DATABASE_URI'] = f'postgresql+psycopg2://{DB_USER}:{DB_PASS}@/{DB}?host=/cloudsql/{CONNECTION}'
+
+  return config
+
+
 config = get_config(APP_CONFIG)
diff --git a/base/constants.py b/base/constants.py
index 1111a850..4e0478ad 100644
--- a/base/constants.py
+++ b/base/constants.py
@@ -6,15 +6,29 @@
 Author: Daniel E. Cook (danielecook@gmail.com)
 
 """
-from base.config import WORMBASE_VERSION
+import os
+
+WORMBASE_VERSION = 'WS276'
+
+STRAIN_PHOTO_PATH = 'photos/Celegans/'
+
+USER_ROLES = [('user', 'User'), ('admin', 'Admin')]
+BAM_BAI_DOWNLOAD_SCRIPT_NAME = "bam_bai_signed_download_script.sh"
 
 class PRICES:
-    DIVERGENT_SET = 160
-    STRAIN_SET = 640
-    STRAIN = 15
-    SHIPPING = 65
+  DIVERGENT_SET = 160
+  STRAIN_SET = 640
+  STRAIN = 15
+  SHIPPING = 65
+
+
+SHIPPING_OPTIONS = [('UPS', 'UPS'),
+                    ('FEDEX', 'FEDEX'),
+                    ('Flat Rate Shipping', '${} Flat Fee'.format(PRICES.SHIPPING))]
+PAYMENT_OPTIONS = [('check', 'Check'),
+                   ('credit_card', 'Credit Card')]
 
 # Maps chromosome in roman numerals to integer
 CHROM_NUMERIC = {"I": 1,
@@ -25,6 +39,12 @@ class PRICES:
                  "X": 6,
                  "MtDNA": 7}
 
+
+GOOGLE_CLOUD_BUCKET = 'elegansvariation.org'
+GOOGLE_CLOUD_PROJECT_ID = 'andersen-lab'
+GOOGLE_CLOUD_LOCATION = 'us-central1'
+
 # WI Strain Info Dataset
 GOOGLE_SHEETS = {"orders": "1BCnmdJNRjQR3Bx8fMjD_IlTzmh3o7yj8ZQXTkk6tTXM",
                  "WI": "1V6YHzblaDph01sFDI8YK_fP0H7sVebHQTXypGdiQIjI"}
@@ -39,35 +59,31 @@ class URLS:
     URLs are stored here so they can
     be easily integrated into the database
     for provenance purposes.
     """
-    #
-    # AWS URLS
+    # BAMs are now hosted on google cloud buckets
     #
-    BAM_URL_PREFIX = "https://s3.us-east-2.amazonaws.com/elegansvariation.org/bam"
+    BAM_URL_PREFIX = f"https://storage.googleapis.com/{GOOGLE_CLOUD_BUCKET}/bam"
+
+    # Variant Annotation CSV
+    STRAIN_VARIANT_ANNOTATION_URL = "https://storage.googleapis.com/elegansvariation.org/db/WI.20210121.strain-annotation.bcsq.20210401.csv"
 
     """
    Wormbase URLs
     """
     # Gene GTF
-    GENE_GTF_URL = f"ftp://ftp.wormbase.org/pub/wormbase/releases/{WORMBASE_VERSION}/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.{WORMBASE_VERSION}.canonical_geneset.gtf.gz"
-
+    GENE_GTF_URL = "ftp://ftp.wormbase.org/pub/wormbase/releases/{WB}/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.{WB}.canonical_geneset.gtf.gz"
     # GENE GFF_URL
-    GENE_GFF_URL = f"ftp://ftp.wormbase.org/pub/wormbase/releases/{WORMBASE_VERSION}/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.{WORMBASE_VERSION}.annotations.gff3.gz"
-
+    GENE_GFF_URL = "ftp://ftp.wormbase.org/pub/wormbase/releases/{WB}/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.{WB}.annotations.gff3.gz"
     # Maps wormbase ID to locus name
     GENE_IDS_URL = "ftp://ftp.wormbase.org/pub/wormbase/species/c_elegans/annotation/geneIDs/c_elegans.PRJNA13758.current.geneIDs.txt.gz"
-
     # Lists C. elegans orthologs
     ORTHOLOG_URL = "ftp://ftp.wormbase.org/pub/wormbase/species/c_elegans/PRJNA13758/annotation/orthologs/c_elegans.PRJNA13758.current_development.orthologs.txt"
 
     #
     # Ortholog URLs
     #
-
     # Homologene
     HOMOLOGENE_URL = 'https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data'
-
     # Taxon IDs
     TAXON_ID_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
 
@@ -91,4 +107,11 @@ class URLS:
 
 TABLE_COLORS = {"LOW": 'success',
                 "MODERATE": 'warning',
-                "HIGH": 'danger'}
\ No newline at end of file
+                "HIGH": 'danger'}
+
+
+DEFAULT_CLOUD_CONFIG = 'default'
+
+REPORT_VERSIONS = ['', 'v1', 'v2']
+REPORT_V1_FILE_LIST = ['methods.md']
+REPORT_V2_FILE_LIST = ['alignment_report.html', 'concordance_report.html', 'gatk_report.html', 'methods.md', 'reads_mapped_by_strain.tsv', 'release_notes.md']
\ No newline at end of file
diff --git a/base/database/__init__.py b/base/database/__init__.py
index bc094675..3ae82224 100644
--- a/base/database/__init__.py
+++ b/base/database/__init__.py
@@ -1,179 +1,238 @@
 import os
 import arrow
 import pickle
-
 from rich.console import Console
+
 from base import constants
-from base.constants import URLS
+from base.constants import URLS, GOOGLE_CLOUD_BUCKET
+from base.config import config
 from base.utils.data_utils import download
-from base.utils.gcloud import upload_file
-from base.models import (db,
+from base.utils.decorators import timeit
+from base.models import (StrainAnnotatedVariants, db,
                          Strain,
                          Homologs,
                          Metadata,
                          WormbaseGene,
                          WormbaseGeneSummary)
-from base.config import (CENDR_VERSION,
-                         APP_CONFIG,
-                         DATASET_RELEASE,
-                         WORMBASE_VERSION,
-                         RELEASES)
 
 # ETL Pipelines - fetch and format data for
-# input into the sqlite database
+# input into the postgres database
 from base.database.etl_homologene import fetch_homologene
 from base.database.etl_strains import fetch_andersen_strains
 from base.database.etl_wormbase import (fetch_gene_gff_summary,
                                         fetch_gene_gtf,
                                         fetch_orthologs)
+from base.database.etl_variant_annot import fetch_strain_variant_annotation_data
 
-console = Console()
 DOWNLOAD_PATH = ".download"
-
+console = Console()
 
 def download_fname(download_path: str, download_url: str):
-    return os.path.join(download_path,
-                        download_url.split("/")[-1])
-
+  return os.path.join(download_path,
+                      download_url.split("/")[-1])
 
-def initialize_sqlite_database(sel_wormbase_version,
+@timeit
+def initialize_postgres_database(sel_wormbase_version,
                                strain_only=False):
-    """Create a static sqlite database
-    Args:
-        sel_wormbase_version - e.g. WS245
-
-    Generate an sqlite database
-    """
-    start = arrow.utcnow()
-    console.log("Initializing Database")
-
-    SQLITE_PATH = f"base/cendr.{DATASET_RELEASE}.{sel_wormbase_version}.db"
-    SQLITE_BASENAME = os.path.basename(SQLITE_PATH)
-
-    # Download wormbase files
-    if strain_only is False:
-        if os.path.exists(SQLITE_PATH):
-            os.remove(SQLITE_PATH)
-
-        if not os.path.exists(DOWNLOAD_PATH):
-            os.makedirs(DOWNLOAD_PATH)
-
-        # Parallel URL download
-        console.log("Downloading Wormbase Data")
-        download([URLS.GENE_GFF_URL,
-                  URLS.GENE_GTF_URL,
-                  URLS.GENE_IDS_URL,
-                  URLS.HOMOLOGENE_URL,
-                  URLS.ORTHOLOG_URL,
-                  URLS.TAXON_ID_URL],
-                 DOWNLOAD_PATH)
-
-        gff_fname = download_fname(DOWNLOAD_PATH, URLS.GENE_GFF_URL)
-        gtf_fname = download_fname(DOWNLOAD_PATH, URLS.GENE_GTF_URL)
-        gene_ids_fname = download_fname(DOWNLOAD_PATH, URLS.GENE_IDS_URL)
-        homologene_fname = download_fname(DOWNLOAD_PATH, URLS.HOMOLOGENE_URL)
-        ortholog_fname = download_fname(DOWNLOAD_PATH, URLS.ORTHOLOG_URL)
-
-    from base.application import create_app
-    app = create_app()
-    app.config['SQLALCHEMY_DATABASE_URI'] = f"sqlite:///{SQLITE_BASENAME}"
-    app.app_context().push()
-
-    if strain_only is True:
-        db.metadata.drop_all(bind=db.engine, checkfirst=True, tables=[Strain.__table__])
-        db.metadata.create_all(bind=db.engine, tables=[Strain.__table__])
-    else:
-        db.create_all(app=app)
-    db.session.commit()
-
-    console.log(f"Created {SQLITE_PATH}")
-
-    ################
-    # Load Strains #
-    ################
-    console.log('Loading strains...')
-    db.session.bulk_insert_mappings(Strain, fetch_andersen_strains())
-    db.session.commit()
-    console.log(f"Inserted {Strain.query.count()} strains")
-
-    if strain_only is True:
-        console.log('Finished loading strains')
-        return
-
-    ################
-    # Set metadata #
-    ################
-    console.log('Inserting metadata')
-    metadata = {}
-    metadata.update(vars(constants))
-    metadata.update({"CENDR_VERSION": CENDR_VERSION,
-                     "APP_CONFIG": APP_CONFIG,
-                     "DATASET_RELEASE": DATASET_RELEASE,
-                     "WORMBASE_VERSION": sel_wormbase_version,
-                     "RELEASES": RELEASES,
-                     "DATE": arrow.utcnow()})
-    for k, v in metadata.items():
-        if not k.startswith("_"):
-            # For nested constants:
-            if type(v) == type:
-                for name in [x for x in dir(v) if not x.startswith("_")]:
-                    key_val = Metadata(key="{}/{}".format(k, name),
-                                       value=getattr(v, name))
-                    db.session.add(key_val)
-            else:
-                key_val = Metadata(key=k, value=str(v))
-                db.session.add(key_val)
-
-    db.session.commit()
-
-    ##############
-    # Load Genes #
-    ##############
-    console.log('Loading summary gene table')
-    genes = fetch_gene_gff_summary(gff_fname)
-    db.session.bulk_insert_mappings(WormbaseGeneSummary, genes)
-    db.session.commit()
-
-    console.log('Loading gene table')
-    db.session.bulk_insert_mappings(WormbaseGene, fetch_gene_gtf(gtf_fname, gene_ids_fname))
-    gene_summary = db.session.query(WormbaseGene.feature,
-                                    db.func.count(WormbaseGene.feature)) \
-                             .group_by(WormbaseGene.feature) \
-                             .all()
-    gene_summary = '\n'.join([f"{k}: {v}" for k, v in gene_summary])
-    console.log(f"============\nGene Summary\n------------\n{gene_summary}\n============")
-
-    ###############################
-    # Load homologs and orthologs #
-    ###############################
-    console.log('Loading homologs from homologene')
-    db.session.bulk_insert_mappings(Homologs, fetch_homologene(homologene_fname))
-    db.session.commit()
-
-    console.log('Loading orthologs from WormBase')
-    db.session.bulk_insert_mappings(Homologs, fetch_orthologs(ortholog_fname))
-    db.session.commit()
-
-    #############
-    # Upload DB #
-    #############
-
-    # Upload the file using todays date for archiving purposes
-    console.log(f"Uploading Database ({SQLITE_BASENAME})")
-    upload_file(f"db/{SQLITE_BASENAME}", SQLITE_PATH)
-
-    diff = int((arrow.utcnow() - start).total_seconds())
-    console.log(f"{diff} seconds")
-
-    # =========================== #
-    #   Generate gene id dict     #
-    # =========================== #
-    # Create a gene dictionary to match wormbase IDs to either the locus name
-    # or a sequence id
-    gene_dict = {x.gene_id: x.locus or x.sequence_name for x in WormbaseGeneSummary.query.all()}
-    pickle.dump(gene_dict, open("base/static/data/gene_dict.pkl", 'wb'))
-
-
-def download_sqlite_database():
-    SQLITE_PATH = f"base/cendr.{DATASET_RELEASE}.{WORMBASE_VERSION}.db"
-    SQLITE_BASENAME = os.path.basename(SQLITE_PATH)
-    download([f"https://storage.googleapis.com/elegansvariation.org/db/{SQLITE_BASENAME}"], "base")
+  """Create a postgres database
+  Args:
+      sel_wormbase_version - e.g. WS276
+
+  Generate a postgres database
+  """
+  console.log("Initializing Database")
+  DATASET_RELEASE = config['DATASET_RELEASE']
+
+  # Download wormbase files
+  if strain_only is False:
+    f = download_external_data(sel_wormbase_version)
+
+  from base.application import create_app
+  app = create_app()
+  app.app_context().push()
+
+  app.config['SQLALCHEMY_DATABASE_URI'] = 'postgresql://admin:password@localhost/cendr'
+
+  if strain_only is True:
+    reset_tables(app, db, tables=[Strain.__table__])
+  else:
+    reset_tables(app, db)
+
+  load_strains(db)
+  if strain_only is True:
+    console.log('Finished loading strains')
+    return
+
+  load_metadata(db, sel_wormbase_version)
+  load_genes_summary(db, f)
+  load_genes_table(db, f)
+  load_homologs(db, f)
+  load_orthologs(db, f)
+  load_variant_annotation(db, f)
+  generate_gene_dict()
+
+
+##########################
+# Download external data #
+##########################
+@timeit
+def download_external_data(sel_wormbase_version):
+  console.log('Downloading External Data...')
+  if not os.path.exists(DOWNLOAD_PATH):
+    os.makedirs(DOWNLOAD_PATH)
+
+  # Parallel URL download
+  console.log("Downloading Wormbase Data")
+  GENE_GFF_URL = URLS.GENE_GFF_URL.format(WB=sel_wormbase_version)
+  GENE_GTF_URL = URLS.GENE_GTF_URL.format(WB=sel_wormbase_version)
+  download([URLS.STRAIN_VARIANT_ANNOTATION_URL,
+            GENE_GFF_URL,
+            GENE_GTF_URL,
+            URLS.GENE_IDS_URL,
+            URLS.HOMOLOGENE_URL,
+            URLS.ORTHOLOG_URL,
+            URLS.TAXON_ID_URL],
+           DOWNLOAD_PATH)
+
+  fnames = {
+    "sva": download_fname(DOWNLOAD_PATH, URLS.STRAIN_VARIANT_ANNOTATION_URL),
+    "gff": download_fname(DOWNLOAD_PATH, GENE_GFF_URL),
+    "gtf": download_fname(DOWNLOAD_PATH, GENE_GTF_URL),
+    "gene_ids": download_fname(DOWNLOAD_PATH, URLS.GENE_IDS_URL),
+    "homologene": download_fname(DOWNLOAD_PATH, URLS.HOMOLOGENE_URL),
+    "ortholog": download_fname(DOWNLOAD_PATH, URLS.ORTHOLOG_URL)
+  }
+  return fnames
+
+
+################
+# Reset Tables #
+################
+@timeit
+def reset_tables(app, db, tables=None):
+  if tables is None:
+    console.log('Dropping all tables...')
+    db.drop_all(app=app)
+    console.log('Creating all tables...')
+    db.create_all(app=app)
+  else:
+    console.log(f'Dropping tables: {tables}')
+    db.metadata.drop_all(bind=db.engine, checkfirst=True, tables=tables)
+    console.log(f'Creating tables: {tables}')
+    db.metadata.create_all(bind=db.engine, tables=tables)
+
+  db.session.commit()
+
+
+################
+# Load Strains #
+################
+@timeit
+def load_strains(db):
+  console.log('Loading strains...')
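+  # fetch_andersen_strains() reads the Andersen Lab strain sheet from Google
+  # Sheets and keeps only records with a release assigned (see etl_strains.py)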
+  andersen_strains = fetch_andersen_strains()
+  db.session.bulk_insert_mappings(Strain, andersen_strains)
+  db.session.commit()
+  console.log(f"Inserted {Strain.query.count()} strains")
+
+
+################
+# Set metadata #
+################
+@timeit
+def load_metadata(db, sel_wormbase_version):
+  console.log('Inserting metadata')
+  metadata = {}
+  metadata.update(vars(constants))
+  metadata.update({"CENDR_VERSION": config['CENDR_VERSION'],
+                   "APP_CONFIG": config['APP_CONFIG'],
+                   "DATASET_RELEASE": config['DATASET_RELEASE'],
+                   "WORMBASE_VERSION": sel_wormbase_version,
+                   "RELEASES": config['RELEASES'],
+                   "DATE": arrow.utcnow()})
+
+  for k, v in metadata.items():
+    if not k.startswith("_"):
+      # For nested constants:
+      if type(v) == type:
+        for name in [x for x in dir(v) if not x.startswith("_")]:
+          key_val = Metadata(key="{}/{}".format(k, name),
+                             value=getattr(v, name))
+          db.session.add(key_val)
+      else:
+        key_val = Metadata(key=k, value=str(v))
+        db.session.add(key_val)
+
+  db.session.commit()
+
+
+##############
+# Load Genes #
+##############
+@timeit
+def load_genes_summary(db, f):
+  console.log('Loading summary gene table')
+  gene_summary = fetch_gene_gff_summary(f['gff'])
+  db.session.bulk_insert_mappings(WormbaseGeneSummary, gene_summary)
+  db.session.commit()
+
+
+@timeit
+def load_genes_table(db, f):
+  console.log('Loading gene table')
+  genes = fetch_gene_gtf(f['gtf'], f['gene_ids'])
+  db.session.bulk_insert_mappings(WormbaseGene, genes)
+  db.session.commit()
+
+  results = db.session.query(WormbaseGene.feature, db.func.count(WormbaseGene.feature)) \
+                      .group_by(WormbaseGene.feature) \
+                      .all()
+  result_summary = '\n'.join([f"{k}: {v}" for k, v in results])
+  console.log(f"============\nGene Summary\n------------\n{result_summary}\n============\n")
+
+
+###############################
+# Load homologs               #
+###############################
+@timeit
+def load_homologs(db, f):
+  console.log('Loading homologs from homologene')
+  homologene = fetch_homologene(f['homologene'])
+  db.session.bulk_insert_mappings(Homologs, homologene)
+  db.session.commit()
+
+
+###############################
+# Load Orthologs              #
+###############################
+@timeit
+def load_orthologs(db, f):
+  console.log('Loading orthologs from WormBase')
+  orthologs = fetch_orthologs(f['ortholog'])
+  db.session.bulk_insert_mappings(Homologs, orthologs)
+  db.session.commit()
+
+
+######################################
+# Load Strain Variant Annotated Data #
+######################################
+@timeit
+def load_variant_annotation(db, f):
+  console.log('Loading strain variant annotated csv')
+  sva_data = fetch_strain_variant_annotation_data(f['sva'])
+  db.session.bulk_insert_mappings(StrainAnnotatedVariants, sva_data)
+  db.session.commit()
+
+
+# =========================== #
+#   Generate gene id dict     #
+# =========================== #
+# Create a gene dictionary to match wormbase IDs to either the locus name
+# or a sequence id
+@timeit
+def generate_gene_dict():
+  console.log('Generating gene_dict.pkl')
+  gene_dict = {x.gene_id: x.locus or x.sequence_name for x in WormbaseGeneSummary.query.all()}
+  pickle.dump(gene_dict, open("base/static/data/gene_dict.pkl", 'wb'))
diff --git a/base/database/etl_homologene.py b/base/database/etl_homologene.py
index bb71d8bf..2798e527 100644
--- a/base/database/etl_homologene.py
+++ b/base/database/etl_homologene.py
@@ -8,11 +8,15 @@
 import re
 import tarfile
 import csv
+
+from logzero import logger
 from urllib.request import urlretrieve
 from tempfile import NamedTemporaryFile
 from base.models import WormbaseGeneSummary
 from base.constants import URLS
 
+C_ELEGANS_PREFIX = 'CELE_'
+C_ELEGANS_HOMOLOG_ID = 6239
 
 def fetch_taxon_ids():
     """
@@ -57,19 +61,31 @@ def fetch_homologene(homologene_fname: str):
     taxon_ids = fetch_taxon_ids()
 
     # First, fetch records with a homolog ID that possesses a C. elegans gene.
-    elegans_set = dict([[int(x[0]), x[3]] for x in response_csv if x[1] == '6239'])
+    elegans_set = dict([[int(x[0]), x[3]] for x in response_csv if x[1] == str(C_ELEGANS_HOMOLOG_ID)])
+
+    # Remove CELE_ prefix from some gene names
+    for k, v in elegans_set.items():
+        elegans_set[k] = v.replace(C_ELEGANS_PREFIX, '')
 
+    idx = 0
+    count = 0
     for line in response_csv:
-        tax_id = int(line[1])
-        homolog_id = int(line[0])
-        if homolog_id in elegans_set.keys() and tax_id != 6239:
-            # Try to resolve the wormbase WB ID if possible.
-            gene_name = elegans_set[homolog_id]
-            gene_id = WormbaseGeneSummary.resolve_gene_id(gene_name) or line[2]
-            yield {'gene_id': gene_id,
-                   'gene_name': gene_name,
-                   'homolog_species': taxon_ids[tax_id],
-                   'homolog_taxon_id': tax_id,
-                   'homolog_gene': line[3],
-                   'homolog_source': "Homologene",
-                   'is_ortholog': False}
+        idx += 1
+        tax_id = int(line[1])
+        homolog_id = int(line[0])
+        if homolog_id in elegans_set.keys() and tax_id != int(C_ELEGANS_HOMOLOG_ID):
+            # Try to resolve the wormbase WB ID if possible.
+            gene_name = elegans_set[homolog_id]
+            gene_id = WormbaseGeneSummary.resolve_gene_id(gene_name)
+            ref = WormbaseGeneSummary.query.filter(WormbaseGeneSummary.gene_id == gene_id).first()
+            if idx % 10000 == 0:
+                logger.info(f'Processed {idx} records yielding {count} inserts')
+            if ref:
+                count += 1
+                yield {'gene_id': gene_id,
+                       'gene_name': gene_name,
+                       'homolog_species': taxon_ids[tax_id],
+                       'homolog_taxon_id': tax_id,
+                       'homolog_gene': line[3],
+                       'homolog_source': "Homologene",
+                       'is_ortholog': False}
diff --git a/base/database/etl_strains.py b/base/database/etl_strains.py
index 7f45c04b..48c1fd06 100644
--- a/base/database/etl_strains.py
+++ b/base/database/etl_strains.py
@@ -15,6 +15,7 @@
 from logzero import logger
 from base.config import config
 
+NULL_VALS = ["None", "", "NA", None]
 
 def elevation_cache(func):
     """quick and simple cache for lat/lon"""
@@ -71,13 +72,13 @@ def fetch_andersen_strains():
     WI = get_google_sheet(config['ANDERSEN_LAB_STRAIN_SHEET'])
     strain_records = WI.get_all_records()
     # Only take records with a release reported
-    strain_records = list(filter(lambda x: x.get('release') not in ['', None, 'NA'], strain_records))
+    strain_records = list(filter(lambda x: x.get('release') not in NULL_VALS, strain_records))
     results = []
     for n, record in enumerate(strain_records):
         record = {k.lower(): v for k, v in record.items()}
         for k, v in record.items():
             # Set NA to None
-            if v in ["NA", '']:
+            if v in NULL_VALS:
                 v = None
                 record[k] = v
             if k in ['sampling_date'] and v:
@@ -95,12 +96,12 @@ def fetch_andersen_strains():
         record["issues"] = record["issues"] == "TRUE"
 
         # Set isotype_ref_strain = FALSE if no isotype is assigned.
-        if record['isotype'] in [None, "", "NA"]:
+        if record['isotype'] in NULL_VALS:
             record['isotype_ref_strain'] = False
             record['wgs_seq'] = False
 
         # Skip strains that lack an isotype
-        if record['isotype'] in [None, "", "NA"] and record['issues'] is False:
+        if record['isotype'] in NULL_VALS and record['issues'] is False:
             continue
 
diff --git a/base/database/etl_variant_annot.py b/base/database/etl_variant_annot.py
new file mode 100644
index 00000000..9ba2a37c
--- /dev/null
+++ b/base/database/etl_variant_annot.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+"""
+Loads the Strain Variant Annotated CSV into the postgres DB
+
+Author: Sam Wachspress
+"""
+import csv
+import re
+
+from logzero import logger
+from base.models import StrainAnnotatedVariants
+
+def fetch_strain_variant_annotation_data(sva_fname: str):
+    """
+    Load strain variant annotation table data:
+
+    CHROM,POS,REF,ALT,CONSEQUENCE,WORMBASE_ID,TRANSCRIPT,BIOTYPE,
+    STRAND,AMINO_ACID_CHANGE,DNA_CHANGE,Strains,BLOSUM,Grantham,
+    Percent_Protein,GENE,VARIANT_IMPACT,DIVERGENT
+    """
+    with open(sva_fname) as csv_file:
+        csv_reader = csv.reader(csv_file, delimiter=',')
+
+        line_count = -1
+        for row in csv_reader:
+            if line_count == -1:
+                print(f'Column names are {", ".join(row)}')
+                line_count += 1
+            else:
+                line_count += 1
+                if line_count % 100000 == 0:
+                    logger.info(f"Processed {line_count} lines;")
+
+                target_consequence = None
+                consequence = row[4] if row[4] else None
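+                # Some rows appear to encode CONSEQUENCE as '@<row id>',
+                # referencing another row; store that id as target_consequence
+                # and leave consequence empty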
+                pattern = '^@[0-9]*$'
+                # guard against empty CONSEQUENCE fields: re.match raises a
+                # TypeError when passed None
+                alt_target = re.match(pattern, consequence) if consequence else None
+                if alt_target:
+                    target_consequence = int(consequence[1:])
+                    consequence = None
+
+                yield {
+                    'id': line_count,
+                    'chrom': row[0],
+                    'pos': int(row[1]),
+                    'ref_seq': row[2] if row[2] else None,
+                    'alt_seq': row[3] if row[3] else None,
+                    'consequence': consequence,
+                    'target_consequence': target_consequence,
+                    'gene_id': row[5] if row[5] else None,
+                    'transcript': row[6] if row[6] else None,
+                    'biotype': row[7] if row[7] else None,
+                    'strand': row[8] if row[8] else None,
+                    'amino_acid_change': row[9] if row[9] else None,
+                    'dna_change': row[10] if row[10] else None,
+                    'strains': row[11] if row[11] else None,
+                    'blosum': int(row[12]) if row[12] else None,
+                    'grantham': int(row[13]) if row[13] else None,
+                    'percent_protein': float(row[14]) if row[14] else None,
+                    'gene': row[15] if row[15] else None,
+                    'variant_impact': row[16] if row[16] else None,
+                    'divergent': row[17] == 'D',
+                }
+
+    print(f'Processed {line_count} lines.')
diff --git a/base/database/etl_wormbase.py b/base/database/etl_wormbase.py
index 0cd02e02..d6648239 100644
--- a/base/database/etl_wormbase.py
+++ b/base/database/etl_wormbase.py
@@ -8,6 +8,7 @@
 Author: Daniel E. Cook (danielecook@gmail.com)
 
 """
+from base.models import WormbaseGeneSummary
 import csv
 import gzip
 from logzero import logger
@@ -46,7 +47,11 @@ def fetch_gene_gtf(gtf_fname: str, gene_ids_fname: str):
     gene_gtf.frame = gene_gtf.frame.apply(lambda x: x if x != "." else None)
     gene_gtf.exon_number = gene_gtf.exon_number.apply(lambda x: x if x != "" else None)
     gene_gtf['arm_or_center'] = gene_gtf.apply(lambda row: arm_or_center(row['chrom'], row['pos']), axis=1)
 
+    idx = 0
     for row in gene_gtf.to_dict('records'):
+        idx += 1
+        if idx % 100000 == 0:
+            logger.info(f"Processed {idx} lines")
         yield row
 
@@ -98,17 +103,25 @@ def fetch_orthologs(orthologs_fname: str):
     """
     csv_out = list(csv.reader(open(orthologs_fname, 'r'), delimiter='\t'))
 
+    idx = 0
+    count = 0
     for line in csv_out:
-        size_of_line = len(line)
-        if size_of_line < 2:
-            continue
-        elif size_of_line == 2:
-            wb_id, locus_name = line
-        else:
-            yield {'gene_id': wb_id,
-                   'gene_name': locus_name,
-                   'homolog_species': line[0],
-                   'homolog_taxon_id': None,
-                   'homolog_gene': line[2],
-                   'homolog_source': line[3],
-                   'is_ortholog': line[0] == 'Caenorhabditis elegans'}
+        idx += 1
+        size_of_line = len(line)
+        if size_of_line < 2:
+            continue
+        elif size_of_line == 2:
+            wb_id, locus_name = line
+        else:
+            ref = WormbaseGeneSummary.query.filter(WormbaseGeneSummary.gene_id == wb_id).first()
+            if idx % 10000 == 0:
+                logger.info(f'Processed {idx} records yielding {count} inserts')
+            if ref:
+                count += 1
+                yield {'gene_id': wb_id,
+                       'gene_name': locus_name,
+                       'homolog_species': line[0],
+                       'homolog_taxon_id': None,
+                       'homolog_gene': line[2],
+                       'homolog_source': line[3],
+                       'is_ortholog': line[0] == 'Caenorhabditis elegans'}
diff --git a/base/database/readme.md b/base/database/readme.md
new file mode 100644
index 00000000..87d0c4eb
--- /dev/null
+++ b/base/database/readme.md
@@ -0,0 +1,13 @@
+# CeNDR Database
+
+This directory contains the scripts that perform the 'initdb' flask action.
+It requires a local PostgreSQL instance to be running.
+
+The database can then be dumped with:
+
+```
+pg_dump -U admin --format=plain --no-owner --no-acl cendr > cendr.sql
+```
+
+The .sql file can then be uploaded to Google Cloud Buckets and batch-imported into the Cloud SQL instance.
diff --git a/base/extensions.py b/base/extensions.py
index ed5042e3..c846102e 100644
--- a/base/extensions.py
+++ b/base/extensions.py
@@ -5,9 +5,12 @@
 from flask_sslify import SSLify
 from flask_debugtoolbar import DebugToolbarExtension
 from flask_sqlalchemy import SQLAlchemy
+from flask_jwt_extended import JWTManager
 
-sqlalchemy = SQLAlchemy
+
+sqlalchemy = SQLAlchemy()
 markdown = Markdown
 cache = Cache(config={'CACHE_TYPE': 'base.utils.cache.datastore_cache'})
 sslify = SSLify
 debug_toolbar = DebugToolbarExtension
+jwt = JWTManager()
diff --git a/base/forms.py b/base/forms.py
index 09bbe586..6e20e02e 100644
--- a/base/forms.py
+++ b/base/forms.py
@@ -2,20 +2,34 @@
 import pandas as pd
 import numpy as np
 
-from flask_wtf import Form, RecaptchaField
+from flask_wtf import FlaskForm, RecaptchaField, Form
 from wtforms import (StringField,
+                     DateField,
+                     BooleanField,
                      TextAreaField,
                      IntegerField,
                      SelectField,
+                     SelectMultipleField,
+                     widgets,
                      FieldList,
                      HiddenField,
                      RadioField)
-from wtforms.validators import Required, Length, Email, DataRequired
-from wtforms.validators import ValidationError
+from wtforms.fields.simple import PasswordField
+from wtforms.validators import (Required,
+                                Length,
+                                Email,
+                                DataRequired,
+                                EqualTo,
+                                Optional,
+                                ValidationError)
+
+from wtforms.fields.html5 import EmailField
+
+from base.constants import PRICES, USER_ROLES, SHIPPING_OPTIONS, PAYMENT_OPTIONS
 from base.utils.gcloud import query_item
-from base.constants import PRICES
+from base.models import user_ds
 from base.views.api.api_strain import query_strains
 from base.utils.data_utils import is_number, list_duplicates
 from slugify import slugify
@@ -23,24 +37,87 @@
 from logzero import logger
 
 
-class donation_form(Form):
-    """
-    The donation form
-    """
-    name = StringField('Name', [Required(), Length(min=3, max=100)])
-    address = TextAreaField('Address', [Length(min=10, max=200)])
-    email = StringField('Email', [Email(), Length(min=3, max=100)])
-    total = IntegerField('Donation Amount')
-    recaptcha = RecaptchaField()
+class MultiCheckboxField(SelectMultipleField):
+    widget = widgets.ListWidget(prefix_label=False)
+    option_widget = widgets.CheckboxInput()
+
+
+class file_upload_form(FlaskForm):
+    pass
+
+
+class basic_login_form(FlaskForm):
+    """
+    The simple username/password login form
+    """
+    username = StringField('Username', [Required(), Length(min=5, max=30)])
+    password = PasswordField('Password', [Required(), Length(min=5, max=30)])
+    recaptcha = RecaptchaField()
+
+
+class markdown_form(FlaskForm):
+    """
+    markdown editing form
+    """
+    title = StringField('Title', [Optional()])
+    content = StringField('Content', [Optional()])
+    date = DateField('Date (mm-dd-YYYY)', [Optional()], format='%m-%d-%Y')
+    type = StringField('Type', [Optional()])
+    publish = BooleanField('Publish', [Optional()])
+
+
+class user_register_form(FlaskForm):
+    """
+    Register as a new user with username/password
+    """
+    username = StringField('Username', [Required(), Length(min=5, max=30)])
+    full_name = StringField('Full Name', [Required(), Length(min=5, max=50)])
+    email = EmailField('Email Address', [Required(), Email(), Length(min=6, max=50)])
+    password = PasswordField('Password', [Required(), EqualTo('confirm_password', message='Passwords must match'), Length(min=5, max=30)])
+    confirm_password = PasswordField('Confirm Password', [Required(), EqualTo('password', message='Passwords must match'), Length(min=5, max=30)])
+    recaptcha = RecaptchaField()
+
+    def validate_username(form, field):
+        user = user_ds(field.data)
+        if user._exists:
+            raise ValidationError("Username already exists")
+
+
+class user_update_form(FlaskForm):
+    """
+    Modifies an existing user's profile
+    """
+    full_name = StringField('Full Name', [Required(), Length(min=5, max=50)])
+    email = EmailField('Email Address', [Required(), Email(), Length(min=6, max=50)])
+    password = PasswordField('Password', [Optional(), EqualTo('confirm_password', message='Passwords must match'), Length(min=5, max=30)])
+    confirm_password = PasswordField('Confirm Password', [Optional(), EqualTo('password', message='Passwords must match'), Length(min=5, max=30)])
+
+
+class admin_edit_user_form(FlaskForm):
+    """
+    A form for one or more roles
+    """
+    roles = MultiCheckboxField('User Roles', choices=USER_ROLES)
+
+
+class data_report_form(FlaskForm):
+    """
+    A form for creating a data release
+    """
+    dataset = SelectField('Release Dataset', validators=[Required()])
+    wormbase = StringField('Wormbase Version WS:', validators=[Required()])
+    version = SelectField('Report Version', validators=[Required()])
 
 
-SHIPPING_OPTIONS = [('UPS', 'UPS'),
-                    ('FEDEX', 'FEDEX'),
-                    ('Flat Rate Shipping', '${} Flat Fee'.format(PRICES.SHIPPING))]
-
-PAYMENT_OPTIONS = [('check', 'Check'),
-                   ('credit_card', 'Credit Card')]
+class donation_form(Form):
+    """
+    The donation form
+    """
+    name = StringField('Name', [Required(), Length(min=3, max=100)])
+    address = TextAreaField('Address', [Length(min=10, max=200)])
+    email = StringField('Email', [Email(), Length(min=3, max=100)])
+    total = IntegerField('Donation Amount')
+    recaptcha = RecaptchaField()
 
 
 class order_form(Form):
@@ -100,8 +177,10 @@ class heritability_form(Form):
 
 #
-# Perform Mapping Form
+# Variant Browser Forms
 #
+class vbrowser_form(FlaskForm):
+    pass
 
 
 class TraitData(HiddenField):
diff --git a/base/manage.py b/base/manage.py
index eef1eded..cb5b72f8 100644
--- a/base/manage.py
+++ b/base/manage.py
@@ -12,8 +12,7 @@
 from click import secho
 from base.utils.gcloud import get_item
 from base.utils.data_utils import zipdir
-from base.database import (initialize_sqlite_database,
-                           download_sqlite_database)
+from base.database import initialize_postgres_database
 from base import constants
 
 from subprocess import Popen, PIPE
@@ -23,19 +22,21 @@
 @click.command(help="Initialize the database")
 @click.argument("wormbase_version", default=constants.WORMBASE_VERSION)
 def initdb(wormbase_version=constants.WORMBASE_VERSION):
-    initialize_sqlite_database(wormbase_version)
+    initialize_postgres_database(wormbase_version)
 
 
 @click.command(help="Updates the strain table of the database")
 @click.argument("wormbase_version", default=constants.WORMBASE_VERSION)
 def update_strains(wormbase_version):
-    initialize_sqlite_database(wormbase_version, strain_only=True)
+    initialize_postgres_database(wormbase_version, strain_only=True)
 
 
-@click.command(help="Download the database (used in docker container)")
-def download_db():
+# Todo: allow downloading a postgres dump/local db in the docker container,
+# or just link to the .sql in cloud storage (even better!)
+# @click.command(help="Download the database (used in docker container)")
+# def download_db():
     # Downloads the latest SQLITE database
-    download_sqlite_database()
+    # download_sqlite_database()
 
 
 @click.command(help="Update credentials")
@@ -46,6 +47,7 @@ def update_credentials():
     from base.application import create_app
     app = create_app()
     app.app_context().push()
+
     click.secho("Zipping env_config", fg='green')
     zipdir('env_config/', 'env_config.zip')
     zip_creds = get_item('credential', 'travis-ci-cred')
@@ -73,6 +75,7 @@ def decrypt_credentials():
     from base.application import create_app
     app = create_app()
     app.app_context().push()
+
     click.secho("Decrypting env_config.zip.enc", fg='green')
     zip_creds = get_item('credential', 'travis-ci-cred')
     comm = ['travis',
diff --git a/base/models.py b/base/models.py
index fd1264cf..da51d248 100644
--- a/base/models.py
+++ b/base/models.py
@@ -1,23 +1,27 @@
 import os
+import re
 import arrow
 import json
 import pandas as pd
 import numpy as np
 import datetime
 import requests
+
 from io import StringIO
 from flask import Markup, url_for
 from flask_sqlalchemy import SQLAlchemy
 from sqlalchemy import or_, func
-from logzero import logger
+from werkzeug.security import safe_str_cmp
 
-from base.constants import URLS
+from base.config import config
+from base.constants import GOOGLE_CLOUD_BUCKET, STRAIN_PHOTO_PATH
+from base.extensions import sqlalchemy
 from base.utils.gcloud import get_item, store_item, query_item, get_cendr_bucket, check_blob
 from base.utils.aws import get_aws_client
+from base.utils.data_utils import hash_password, unique_id
 from gcloud.datastore.entity import Entity
 from collections import defaultdict
 from botocore.exceptions import ClientError
-from base.config import DATASET_RELEASE
 
 db = SQLAlchemy()
 
@@ -83,6 +87,7 @@ class trait_ds(datastore_model):
     If a task is re-run the report will only display the latest version.
     """
     kind = 'trait'
+    kind = '{}{}'.format(config['DS_PREFIX'], kind)
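+    # DS_PREFIX is 'DEV_' under the development config, keeping development
+    # and production entities separate in Cloud Datastore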
""" kind = 'trait' + kind = '{}{}'.format(config['DS_PREFIX'], kind) def __init__(self, *args, **kwargs): """ @@ -233,9 +238,9 @@ def gs_base_url(self): The URL schema changed from REPORT_VERSION v1 to v2. """ if self.REPORT_VERSION == 'v2': - return f"https://storage.googleapis.com/elegansvariation.org/reports/{self.gs_path}" + return f"https://storage.googleapis.com/{GOOGLE_CLOUD_BUCKET}/reports/{self.gs_path}" elif self.REPORT_VERSION == 'v1': - return f"https://storage.googleapis.com/elegansvariation.org/reports/{self.gs_path}" + return f"https://storage.googleapis.com/{GOOGLE_CLOUD_BUCKET}/reports/{self.gs_path}" def get_gs_as_dataset(self, fname): """ @@ -260,7 +265,7 @@ def list_report_files(self): cendr_bucket = get_cendr_bucket() items = cendr_bucket.list_blobs(prefix=f"reports/{self.gs_path}") - return {os.path.basename(x.name): f"https://storage.googleapis.com/elegansvariation.org/{x.name}" for x in items} + return {os.path.basename(x.name): f"https://storage.googleapis.com/{GOOGLE_CLOUD_BUCKET}/{x.name}" for x in items} def file_url(self, fname): """ @@ -276,6 +281,7 @@ class mapping_ds(datastore_model): The mapping/peak interval model """ kind = 'mapping' + kind = '{}{}'.format(config['DS_PREFIX'], kind) def __init__(self, *args, **kwargs): super(mapping_ds, self).__init__(*args, **kwargs) @@ -287,14 +293,35 @@ class user_ds(datastore_model): information on users. """ kind = 'user' + kind = '{}{}'.format(config['DS_PREFIX'], kind) def __init__(self, *args, **kwargs): super(user_ds, self).__init__(*args, **kwargs) + + def set_properties(self, **kwargs): + if 'username' in kwargs: + self.username = kwargs.get('username') + if 'full_name' in kwargs: + self.full_name = kwargs.get('full_name') + if 'password' in kwargs: + self.set_password(kwargs.get('password'), kwargs.get('salt')) + if 'email' in kwargs: + self.set_email(kwargs.get('email')) + if 'roles' in kwargs: + self.roles = kwargs.get('roles') + + def save(self, *args, **kwargs): + now = arrow.utcnow().datetime + if not self._exists: + self.created_on = now + self.modified_on = now + super(user_ds, self).save(*args, **kwargs) + def reports(self): - filters = [('user_id', '=', self.user_id)] + filters = [('user_id', '=', self.name)] # Note this requires a composite index defined very precisely. 
- results = query_item('trait', filters=filters, order=['user_id', '-created_on']) + results = query_item(self.kind, filters=filters, order=['user_id', '-created_on']) results = sorted(results, key=lambda x: x['created_on'], reverse=True) results_out = defaultdict(list) for row in results: @@ -302,17 +329,226 @@ def reports(self): # Generate report objects return results_out + def get_all(self, keys_only=False): + results = query_item(self.kind, keys_only=keys_only) + return results + + def set_password(self, password, salt): + # calling set_password with self.password + if hasattr(self, 'password'): + if (len(password) > 0) and (password != self.password): + self.password = hash_password(password + salt) + else: + self.password = hash_password(password + salt) + + def set_email(self, email): + if hasattr(self, 'email'): + if not safe_str_cmp(email, self.email): + self.email = email + self.email_confirmation_code = unique_id() + self.verified_email = False + else: + self.email = email + self.email_confirmation_code = unique_id() + self.verified_email = False + + def check_password(self, password, salt): + return safe_str_cmp(self.password, hash_password(password + salt)) + + +class markdown_ds(datastore_model): + """ + The Markdown model - for creating and retrieving + documents uploaded to the site + """ + kind = 'markdown' + kind = '{}{}'.format(config['DS_PREFIX'], kind) + + def __init__(self, *args, **kwargs): + super(markdown_ds, self).__init__(*args, **kwargs) + + def get_all(self, keys_only=False): + results = query_item(self.kind, keys_only=keys_only) + return results + + def query_by_type(self, type, keys_only=False): + filters = [('type', '=', type)] + results = query_item(self.kind, filters=filters, keys_only=keys_only) + return results + + def save(self, *args, **kwargs): + now = arrow.utcnow().datetime + self.modified_on = now + if not self._exists: + self.created_on = now + super(markdown_ds, self).save(*args, **kwargs) + + +class ns_calc_ds(datastore_model): + """ + The NemaScan Task Model - metadata for NemaScan nextflow pipeline + execution tasks executed by Google Life Sciences + """ + kind = 'ns_calc' + kind = '{}{}'.format(config['DS_PREFIX'], kind) + + + def __init__(self, *args, **kwargs): + super(ns_calc_ds, self).__init__(*args, **kwargs) + + def query_by_username(self, username, keys_only=False): + filters = [('username', '=', username)] + results = query_item(self.kind, filters=filters, keys_only=keys_only) + return results + + def save(self, *args, **kwargs): + now = arrow.utcnow().datetime + self.modified_on = now + if not self._exists: + self.created_on = now + super(ns_calc_ds, self).save(*args, **kwargs) + + +class gls_op_ds(datastore_model): + """ + The Google Lifesciences Operation Model - metadata for pipeline + task executed by Google Life Sciences + """ + kind = 'gls_operation' + + def __init__(self, *args, **kwargs): + super(gls_op_ds, self).__init__(*args, **kwargs) + + +class h2calc_ds(datastore_model): + """ + The Heritability Calculation Task Model - for creating and retrieving + data and status information about a heritability calculation task + executed in Google Cloud Run + """ + kind = 'h2calc' + kind = '{}{}'.format(config['DS_PREFIX'], kind) + + + def __init__(self, *args, **kwargs): + super(h2calc_ds, self).__init__(*args, **kwargs) + + def query_by_username(self, username, keys_only=False): + filters = [('username', '=', username)] + results = query_item(self.kind, filters=filters, keys_only=keys_only) + return results + + def save(self, 
*args, **kwargs): + now = arrow.utcnow().datetime + self.modified_on = now + if not self._exists: + self.created_on = now + super(h2calc_ds, self).save(*args, **kwargs) + + + +class ip_calc_ds(datastore_model): + """ + The Indel Primer Calculation Task Model - for creating and retrieving + data and status information about an indel primer calculation task + executed in Google Cloud Run + """ + kind = 'ip_calc' + kind = '{}{}'.format(config['DS_PREFIX'], kind) + + + def __init__(self, *args, **kwargs): + super(ip_calc_ds, self).__init__(*args, **kwargs) + + def query_by_username(self, username, keys_only=False): + filters = [('username', '=', username)] + results = query_item(self.kind, filters=filters, keys_only=keys_only) + return results + + def save(self, *args, **kwargs): + now = arrow.utcnow().datetime + self.modified_on = now + if not self._exists: + self.created_on = now + super(ip_calc_ds, self).save(*args, **kwargs) + + +class data_report_ds(datastore_model): + """ + The Data Report model - for creating and retrieving + releases of genomic data + """ + kind = 'data-report' + kind = '{}{}'.format(config['DS_PREFIX'], kind) + + def init(self): + self.dataset = '' + self.wormbase = '' + self.version = '' + self.initialized = False + self.published_on = '' + self.publish = False + self.created_on = arrow.utcnow().datetime + self.report_synced_on = '' + self.db_synced_on = '' + + def __init__(self, *args, **kwargs): + super(data_report_ds, self).__init__(*args, **kwargs) + + def get_all(self, keys_only=False): + results = query_item(self.kind, keys_only=keys_only) + return results + + def list_bucket_dirs(): + """ + Lists 'directories' in GCP Bucket 'data_reports' (unique blob prefixes matching date format) + """ + cendr_bucket = get_cendr_bucket() + items = cendr_bucket.list_blobs(prefix=f"data_reports/") + dirs = [] + pattern = r"^(data_reports\/)([0-9]{8})/" + for i in items: + match = re.search(pattern, i.name) + if match: + dir = match.group(2) + if not dir in dirs: + dirs.append(dir) + + return dirs + + def save(self, *args, **kwargs): + now = arrow.utcnow().datetime + self.modified_on = now + super(data_report_ds, self).save(*args, **kwargs) + + +class config_ds(datastore_model): + """ + The Data Config model - Config stored in the cloud + for the site's data sources + """ + kind = 'config' + kind = '{}{}'.format(config['DS_PREFIX'], kind) + + def __init__(self, *args, **kwargs): + super(config_ds, self).__init__(*args, **kwargs) + + def save(self, *args, **kwargs): + now = arrow.utcnow().datetime + self.modified_on = now + if not self._exists: + self.created_on = now + super(config_ds, self).save(*args, **kwargs) class DictSerializable(object): - def _asdict(self): - result = {} - for key in self.__mapper__.c.keys(): - result[key] = getattr(self, key) - return result + def _asdict(self): + result = {} + for key in self.__mapper__.c.keys(): + result[key] = getattr(self, key) + return result # --------- Break datastore here ---------# - class Metadata(DictSerializable, db.Model): """ Table for storing information about other tables @@ -322,6 +558,48 @@ class Metadata(DictSerializable, db.Model): value = db.Column(db.String) +class WormbaseGeneSummary(DictSerializable, db.Model): + """ + This is a condensed version of the WormbaseGene model; + It is constructed out of convenience and only defines the genes + (not exons/introns/etc.) 
+ """ + __tablename__ = "wormbase_gene_summary" + id = db.Column(db.Integer, primary_key=True) + chrom = db.Column(db.String(7), index=True) + chrom_num = db.Column(db.Integer(), index=True) + start = db.Column(db.Integer(), index=True) + end = db.Column(db.Integer(), index=True) + locus = db.Column(db.String(30), index=True) + gene_id = db.Column(db.String(25), unique=True, index=True) + gene_id_type = db.Column(db.String(15), index=False) + sequence_name = db.Column(db.String(30), index=True) + biotype = db.Column(db.String(30), nullable=True) + gene_symbol = db.column_property(func.coalesce(locus, sequence_name, gene_id)) + interval = db.column_property(func.format("%s:%s-%s", chrom, start, end)) + arm_or_center = db.Column(db.String(12), index=True) + + __gene_id_constraint__ = db.UniqueConstraint(gene_id) + + + def to_json(self): + return {k: v for k, v in self._asdict().items() if not k.startswith("_")} + + + @classmethod + def resolve_gene_id(cls, query): + """ + query - a locus name or transcript ID + output - a wormbase gene ID + + Example: + WormbaseGene.resolve_gene_id('pot-2') --> WBGene00010195 + """ + result = cls.query.filter(or_(cls.locus == query, cls.sequence_name == query)).first() + if result: + return result.gene_id + + class Strain(DictSerializable, db.Model): __tablename__ = "strain" species_id_method = db.Column(db.String(50), nullable=True) @@ -365,12 +643,19 @@ def __repr__(self): return self.strain def to_json(self): - return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} + return {k: v for k, v in self._asdict().items() if not k.startswith("_")} def strain_photo_url(self): # Checks if photo exists and returns URL if it does try: - return check_blob(f"photos/isolation/{self.strain}.jpg").public_url + return check_blob(f"{STRAIN_PHOTO_PATH}{self.strain}.jpg").public_url + except AttributeError: + return None + + def strain_thumbnail_url(self): + # Checks if thumbnail exists and returns URL if it does + try: + return check_blob(f"{STRAIN_PHOTO_PATH}{self.strain}.thumb.jpg").public_url except AttributeError: return None @@ -378,13 +663,16 @@ def strain_bam_url(self): """ Return bam / bam_index url set """ - + bam_file=self.strain + '.bam' + bai_file=self.strain + '.bam.bai' + bam_download_link = url_for('data.download_bam_url', blob_name=bam_file) + bai_download_link = url_for('data.download_bam_url', blob_name=bai_file) url_set = Markup(f""" - + BAM / - + bai """.strip()) @@ -404,13 +692,16 @@ def isotype_bam_url(self): """ Return bam / bam_index url set """ - + bam_file=self.isotype + '.bam' + bai_file=self.isotype + '.bam.bai' + bam_download_link = url_for('data.download_bam_url', blob_name=bam_file) + bai_download_link = url_for('data.download_bam_url', blob_name=bai_file) url_set = Markup(f""" - + BAM / - + bai """.strip()) @@ -452,19 +743,25 @@ def cum_sum_strain_isotype(cls): @classmethod def release_summary(cls, release): - """ - Returns isotype and strain count for a data release. - - Args: - release - the data release - """ - counts = {'strain_count': cls.query.filter((cls.release <= release) & (cls.issues == False)).count(), - 'strain_count_sequenced': cls.query.filter((cls.release <= release) & (cls.issues == False) & (cls.sequenced == True)).count(), - 'isotype_count': cls.query.filter((cls.release <= release) & (cls.issues == False) & (cls.isotype != None)).group_by(cls.isotype).count()} - return counts + """ + Returns isotype and strain count for a data release. 
+ + Args: + release - the data release + """ + release = int(release) + strain_count = cls.query.filter((cls.release <= release) & (cls.issues == False)).count() + strain_count_sequenced = cls.query.filter((cls.release <= release) & (cls.issues == False) & (cls.sequenced == True)).count() + isotype_count = cls.query.with_entities(cls.isotype).filter((cls.isotype != None), (cls.release <= release), (cls.issues == False)).group_by(cls.isotype).count() + + return { + 'strain_count': strain_count, + 'strain_count_sequenced': strain_count_sequenced, + 'isotype_count': isotype_count + } def as_dict(self): - return {c.name: getattr(self, c.name) for c in self.__table__.columns} + return {c.name: getattr(self, c.name) for c in self.__table__.columns} class WormbaseGene(DictSerializable, db.Model): @@ -487,45 +784,15 @@ class WormbaseGene(DictSerializable, db.Model): protein_id = db.Column(db.String(30), nullable=True, index=True) arm_or_center = db.Column(db.String(12), index=True) - gene_summary = db.relationship("WormbaseGeneSummary", backref='gene_components') + __gene_summary__ = db.relationship("WormbaseGeneSummary", backref='wormbase_gene', lazy='joined') - def __repr__(self): - return f"{self.gene_id}:{self.feature} [{self.seqname}:{self.start}-{self.end}]" + def to_json(self): + return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} -class WormbaseGeneSummary(DictSerializable, db.Model): - """ - This is a condensed version of the WormbaseGene model; - It is constructed out of convenience and only defines the genes - (not exons/introns/etc.) - """ - __tablename__ = "wormbase_gene_summary" - id = db.Column(db.Integer, primary_key=True) - chrom = db.Column(db.String(7), index=True) - chrom_num = db.Column(db.Integer(), index=True) - start = db.Column(db.Integer(), index=True) - end = db.Column(db.Integer(), index=True) - locus = db.Column(db.String(30), index=True) - gene_id = db.Column(db.String(25), index=True) - gene_id_type = db.Column(db.String(15), index=False) - sequence_name = db.Column(db.String(30), index=True) - biotype = db.Column(db.String(30), nullable=True) - gene_symbol = db.column_property(func.coalesce(locus, sequence_name, gene_id)) - interval = db.column_property(func.printf("%s:%s-%s", chrom, start, end)) - arm_or_center = db.Column(db.String(12), index=True) - - @classmethod - def resolve_gene_id(cls, query): - """ - query - a locus name or transcript ID - output - a wormbase gene ID - Example: - WormbaseGene.resolve_gene_id('pot-2') --> WBGene00010195 - """ - result = cls.query.filter(or_(cls.locus == query, cls.sequence_name == query)).first() - if result: - return result.gene_id + def __repr__(self): + return f"{self.gene_id}:{self.feature} [{self.seqname}:{self.start}-{self.end}]" class Homologs(DictSerializable, db.Model): @@ -534,23 +801,149 @@ class Homologs(DictSerializable, db.Model): """ __tablename__ = "homologs" id = db.Column(db.Integer, primary_key=True) - gene_id = db.Column(db.ForeignKey('wormbase_gene_summary.gene_id'), nullable=False, index=True) - gene_name = db.Column(db.String(40), index=True) - homolog_species = db.Column(db.String(50), index=True) + gene_id = db.Column(db.ForeignKey('wormbase_gene_summary.gene_id'), nullable=True, index=True) + gene_name = db.Column(db.String(60), index=True) + homolog_species = db.Column(db.String(60), index=True) homolog_taxon_id = db.Column(db.Integer, index=True, nullable=True) # If available - homolog_gene = db.Column(db.String(50), index=True) - homolog_source = db.Column(db.String(40)) + 
homolog_gene = db.Column(db.String(60), index=True) + homolog_source = db.Column(db.String(60)) + is_ortholog = db.Column(db.Boolean(), index=True, nullable=True) + + __gene_summary__ = db.relationship("WormbaseGeneSummary", backref='homologs', lazy='joined') + + + def to_json(self): + return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} - gene_summary = db.relationship("WormbaseGeneSummary", backref='homologs', lazy='joined') def unnest(self): """ Used with the gene API - returns an unnested homolog datastructure combined with the wormbase gene summary model. """ - self.__dict__.update(self.gene_summary.__dict__) - self.__dict__['gene_summary'] = None + self.__dict__.update(self.__gene_summary__.__dict__) return self def __repr__(self): return f"homolog: {self.gene_name} -- {self.homolog_gene}" + + +class StrainAnnotatedVariants(DictSerializable, db.Model): + """ + The Strain Annotated Variant table combines several features linked to variants: + Genetic location, base pairs affected, consequences of reading, gene information, + strains affected, and severity of impact + + """ + __tablename__ = 'variant_annotation' + id = db.Column(db.Integer, primary_key=True) + chrom = db.Column(db.String(7), index=True) + pos = db.Column(db.Integer(), index=True) + ref_seq = db.Column(db.String(), nullable=True) + alt_seq = db.Column(db.String(), nullable=True) + consequence = db.Column(db.String(), nullable=True) + target_consequence = db.Column(db.Integer(), nullable=True) + gene_id = db.Column(db.ForeignKey('wormbase_gene_summary.gene_id'), index=True, nullable=True) + transcript = db.Column(db.String(), index=True, nullable=True) + biotype = db.Column(db.String(), nullable=True) + strand = db.Column(db.String(1), nullable=True) + amino_acid_change = db.Column(db.String(), nullable=True) + dna_change = db.Column(db.String(), nullable=True) + strains = db.Column(db.String(), nullable=True) + blosum = db.Column(db.Integer(), nullable=True) + grantham = db.Column(db.Integer(), nullable=True) + percent_protein = db.Column(db.Float(), nullable=True) + gene = db.Column(db.String(), index=True, nullable=True) + variant_impact = db.Column(db.String(), nullable=True) + divergent = db.Column(db.Boolean(), nullable=True) + + __gene_summary__ = db.relationship("WormbaseGeneSummary", backref='variant_annotation', lazy='joined') + + + column_details = [ + {'id': 'chrom', 'name': 'Chromosome'}, + {'id': 'pos', 'name': 'Position'}, + {'id': 'ref_seq', 'name': 'Ref Sequence'}, + {'id': 'alt_seq', 'name': 'Alt Sequence'}, + {'id': 'consequence', 'name': 'Consequence'}, + {'id': 'target_consequence', 'name': 'Target Consequence'}, + {'id': 'gene_id', 'name': 'Gene ID'}, + {'id': 'transcript', 'name': 'Transcript'}, + {'id': 'biotype', 'name': 'Biotype'}, + {'id': 'strand', 'name': 'Strand'}, + {'id': 'amino_acid_change', 'name': 'Amino Acid Change'}, + {'id': 'dna_change', 'name': 'DNA Change'}, + {'id': 'strains', 'name': 'Strains'}, + {'id': 'blosum', 'name': 'BLOSUM'}, + {'id': 'grantham', 'name': 'Grantham'}, + {'id': 'percent_protein', 'name': 'Percent Protein'}, + {'id': 'gene', 'name': 'Gene'}, + {'id': 'variant_impact', 'name': 'Variant Impact'}, + {'id': 'divergent', 'name': 'Divergent'} + ] + + @classmethod + def generate_interval_sql(cls, interval): + interval = interval.replace(',','') + chrom = interval.split(':')[0] + range = interval.split(':')[1] + start = int(range.split('-')[0]) + stop = int(range.split('-')[1]) + + q = f"SELECT * FROM {cls.__tablename__} WHERE chrom='{chrom}' AND 
pos > {start} AND pos < {stop};" + return q + + + ''' TODO: implement input checks here and in the browser form''' + @classmethod + def verify_interval_query(cls, q): + query_regex = "^(I|II|III|IV|V|X|MtDNA):[0-9,]+-[0-9,]+$" + match = re.search(query_regex, q) + return True if match else False + + + @classmethod + def run_interval_query(cls, q): + q = cls.generate_interval_sql(q) + df = pd.read_sql_query(q, db.engine) + + try: + result = df[['id', 'chrom', 'pos', 'ref_seq', 'alt_seq', 'consequence', 'target_consequence', 'gene_id', 'transcript', 'biotype', 'strand', 'amino_acid_change', 'dna_change', 'strains', 'blosum', 'grantham', 'percent_protein', 'gene', 'variant_impact', 'divergent']].dropna(how='all') \ + .fillna(value="") \ + .agg(list) \ + .to_dict() + except ValueError: + result = {} + return result + + + @classmethod + def generate_position_sql(cls, pos): + pos = pos.replace(',','') + chrom = pos.split(':')[0] + pos = int(pos.split(':')[1]) + + q = f"SELECT * FROM {cls.__tablename__} WHERE chrom='{chrom}' AND pos = {pos};" + return q + + + @classmethod + def verify_position_query(cls, q): + query_regex = "^(I|II|III|IV|V|X|MtDNA):[0-9,]+$" + match = re.search(query_regex, q) + return True if match else False + + + @classmethod + def run_position_query(cls, q): + q = cls.generate_position_sql(q) + df = pd.read_sql_query(q, db.engine) + + try: + result = df[['id', 'chrom', 'pos', 'ref_seq', 'alt_seq', 'consequence', 'target_consequence', 'gene_id', 'transcript', 'biotype', 'strand', 'amino_acid_change', 'dna_change', 'strains', 'blosum', 'grantham', 'percent_protein', 'gene', 'variant_impact', 'divergent']].dropna(how='all') \ + .fillna(value="") \ + .agg(list) \ + .to_dict() + except ValueError: + result = {} + return result diff --git a/base/static/content/help/Change-Log.md b/base/static/content/help/Change-Log.md index f0fc719c..bc1b7b02 100644 --- a/base/static/content/help/Change-Log.md +++ b/base/static/content/help/Change-Log.md @@ -2,12 +2,12 @@ --- -##### v1.5.2 (2020-09-07) +#### v1.5.2 (2020-09-07) * A divergent region summmary track has been added to the primer indel tool. * Sweep haplotypes have been added to the latest release. -##### v1.5.1 (2020-08-30) +#### v1.5.1 (2020-08-30) * The [primer indel tool](/tools/pairwise_indel_finder) has been released. diff --git a/base/static/content/help/FAQ.md b/base/static/content/help/FAQ.md index a9c74e67..7ecdc88b 100644 --- a/base/static/content/help/FAQ.md +++ b/base/static/content/help/FAQ.md @@ -31,7 +31,7 @@ Or use this bibtex entry ### What are hyper-divergent regions? How should I use variants that fall within these regions? -Hyper-divergent regions are genomic intervals that contain sequences not found in the N2 reference strain. They were identified by high levels of variation and low coverage from read alignments. For a more full description, please read [this paper](https://andersenlab.org/publications/2020LeebioRxiv.pdf). We highly recommend that you use the variant browser and view the BAM files for strains of interest. We also released a genomic view track to see where we have classified divergent regions. If you find that your region of interest overlaps with a hyper-divergent region, then we recommend taking any variants as preliminary. Long-read sequencing is required to identify the actual genomic sequences in this region. +Hyper-divergent regions are genomic intervals that contain sequences not found in the N2 reference strain. 
They were identified by high levels of variation and low coverage from read alignments. For a fuller description, please read [this paper](https://andersenlab.org/publications/2020LeebioRxiv.pdf). We highly recommend that you use the genome browser and view the BAM files for strains of interest. We also released a genomic view track to see where we have classified divergent regions. If you find that your region of interest overlaps with a hyper-divergent region, then we recommend taking any variants as preliminary. Long-read sequencing is required to identify the actual genomic sequences in this region. ### How much confidence do we have in the indel variants? @@ -45,7 +45,7 @@ __[See our filter optimization report for further details](/static/reports/filte ### How are strains grouped by isotype? -In 2012, we [published](http://dx.doi.org/10.1038/ng.1050) genome-wide variant data from reduced representation sequencing of approximately 10% of the C. elegans genome (RAD-seq). Using these data, we grouped strains into isotypes. We also found many strains that were mislabeled as wild isolates but were instead N2 derivatives, recombinants from laboratory experiments, and mutagenesis screen isolates (detailed in Strain issues). These strains were not characterized further. For the isotypes, we chose one strain to be the isotype reference strain. This strain can be ordered through CeNDR [here]({{ url_for('strain.strain_catalog') }}). +In 2012, we [published](http://dx.doi.org/10.1038/ng.1050) genome-wide variant data from reduced representation sequencing of approximately 10% of the C. elegans genome (RAD-seq). Using these data, we grouped strains into isotypes. We also found many strains that were mislabeled as wild isolates but were instead N2 derivatives, recombinants from laboratory experiments, and mutagenesis screen isolates (detailed in [Strain issues]({{ url_for('strains.strains_issues') }})). These strains were not characterized further. For the isotypes, we chose one strain to be the isotype reference strain. This strain can be ordered through CeNDR [here]({{ url_for('strains.strains_catalog') }}). After 2012, with advances in genome sequencing, we transitioned our sequencing to whole-genome short-read sequencing. All isotype reference strains were resequenced whole-genome. The other strains within an isotype were not, diff --git a/base/static/content/help/Variant-Browser.md b/base/static/content/help/Variant-Browser.md index 3a857c7f..3a384816 100644 --- a/base/static/content/help/Variant-Browser.md +++ b/base/static/content/help/Variant-Browser.md @@ -1,4 +1,4 @@ -# Variant Browser +# Genome Browser
Concordance analysis allows us to group strains that are genetically almost identical into an isotype. The following table summarizes the number of isotypes from previous and current releases, and the number of strains that belong to those isotypes.
+|  | Isotypes | Strains Included | Strains with WGS data | Strains with RAD-seq data |
+|---|---|---|---|---|
+| Isotypes from Previous Release | 400* | 910 | 770 | 140 |
+| New Isotypes from Current Release | 140 | 468 | 468 | 0 |
+| Total | 540 | 1378 | 1238 | 140 |
+\*Four strains were reduced to a single isotype group, so this number was reduced from 403 to 400 (see below for details).
+We examined the pairwise concordance scores of all strains. Concordance values for every pair of strains were calculated as the number of shared variant sites divided by the total number of variants called for each pair. If the concordance score was greater than 0.9997, the strain pair was grouped into the same isotype.
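As an illustration of this grouping rule, here is a minimal sketch. The union-based denominator and the greedy merge are our reading of the description above, not code taken from the release pipeline, and the per-strain call sets are a hypothetical input:

```python
from itertools import combinations

CONCORDANCE_CUTOFF = 0.9997  # grouping threshold quoted in the release notes

def concordance(sites_a, sites_b):
    """Shared variant sites divided by the total variants called for the pair.

    'Total' is interpreted here as the union of the two call sets.
    """
    total = len(sites_a | sites_b)
    return len(sites_a & sites_b) / total if total else 0.0

def group_isotypes(calls):
    """Greedily merge strains whose pairwise concordance exceeds the cutoff.

    calls: dict mapping strain name -> set of variant sites, e.g. ('II', 94512).
    Returns a set of frozensets, one per isotype group.
    """
    groups = {strain: {strain} for strain in calls}
    for a, b in combinations(calls, 2):
        if concordance(calls[a], calls[b]) > CONCORDANCE_CUTOFF:
            merged = groups[a] | groups[b]
            for strain in merged:
                groups[strain] = merged
    return {frozenset(g) for g in groups.values()}
```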
+Strain comparisons are listed in the table below. Only concordance scores > 0.999 are shown.
+This release used only SNVs for isotype assignment.
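A tiny sketch of what restricting to SNVs means in practice, assuming a simple REF/ALT length check on VCF records (the pipeline's actual selection step is not shown in the release):

```python
def is_snv(ref, alts):
    """True when REF and every ALT allele are single bases (no indels/MNPs)."""
    return len(ref) == 1 and all(len(alt) == 1 for alt in alts)

assert is_snv("A", ["T"])        # SNV: kept for isotype assignment
assert not is_snv("A", ["ATT"])  # insertion: excluded
assert not is_snv("ACG", ["A"])  # deletion: excluded
```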
+Four strains (ECA2677, ECA2678, ECA2679, and ECA2686) were removed because they were frozen as “dirty” strains and have now been cleaned, frozen, and re-sequenced. Because these four strains were isotype reference strains, new isotype reference strains were assigned. The other six strains in these isotype groups appear to have changed isotypes, but they remain in the same groups as before, now with new, clean isotype reference strains. Details can be found below.
+| Dirty Strain (Old) | Clean Strain (New) | Previous Isotype | New Isotype | Other Strains in Isotype Group |
+|---|---|---|---|---|
+| ECA2677 | ECA1202 | ECA2677 | ECA1202 | ECA1201 |
+| ECA2678 | ECA1206 | ECA2678 | ECA1206 | ECA1973, ECA1979, ECA1983 |
+| ECA2679 | ECA1211 | ECA2679 | ECA1212 | ECA1209, ECA1211 |
+| ECA2686 | NA* | ECA2686 | ECA1243 | NA |
+\*The clean strain for ECA2686 is ECA2803, but it has not been sequenced yet.
+Each bar shows counts of variant sites removed by the combinations of filters indicated by the dots below.
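A sketch of how these per-combination counts could be tallied from a soft-filtered VCF. The file name is hypothetical, and we assume the GATK convention that the FILTER column is either `PASS` or a semicolon-separated list of failed filter names:

```python
import gzip
from collections import Counter

def filter_combination_counts(vcf_path):
    """Tally variant sites by the exact combination of failed FILTER tags."""
    counts = Counter()
    opener = gzip.open if vcf_path.endswith(".gz") else open
    with opener(vcf_path, "rt") as vcf:
        for line in vcf:
            if line.startswith("#"):
                continue  # skip header lines
            filter_field = line.split("\t")[6]  # FILTER is the 7th VCF column
            if filter_field not in (".", "PASS"):
                combo = tuple(sorted(filter_field.split(";")))
                counts[combo] += 1
    return counts

# counts = filter_combination_counts("soft-filter.vcf.gz")  # hypothetical path
```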
+Variant counts for each strain, based on the VCF containing all variant sites called by GATK (“soft-filter.vcf”).
+Variant counts for each strain, based on the VCF containing only sites that pass all filters (“hard-filter.vcf”). All heterozygous sites on the main chromosomes were converted to either homozygous or missing calls; the remaining heterozygous sites are all located on the mitochondrial chromosome.
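The heterozygote cleanup described above could look like the following sketch. The majority-read threshold is an assumed heuristic for choosing between homozygous and missing; the release notes state only that het calls on the main chromosomes become one or the other:

```python
MAIN_CHROMS = {"I", "II", "III", "IV", "V", "X"}

def resolve_het(chrom, gt, allele_depths, min_fraction=0.9):
    """Convert a het call on a main chromosome to homozygous or missing.

    gt is a diploid genotype like '0/1'; allele_depths maps allele index -> depth.
    The 0.9 majority-read rule is an assumed heuristic, not from the release notes.
    """
    alleles = gt.replace("|", "/").split("/")
    if chrom not in MAIN_CHROMS or alleles[0] == alleles[1]:
        return gt  # unchanged: mitochondrial site or already homozygous
    total = sum(allele_depths.values())
    for allele, depth in allele_depths.items():
        if total and depth / total >= min_fraction:
            return f"{allele}/{allele}"  # strong majority of reads -> homozygous
    return "./."  # ambiguous support -> missing

# Example: resolve_het("II", "0/1", {"0": 19, "1": 1}) returns '0/0'
```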
+Plotting the number of variants versus the number of heterozygous calls highlights strains that might have mixed genotypes or low-quality calls (high heterozygosity but low variation).