Skip to content

Commit

Permalink
Add metadata table and update docs
Browse files Browse the repository at this point in the history
  • Loading branch information
danielecook committed Jan 17, 2018
1 parent 6fec56a commit f755c1d
Show file tree
Hide file tree
Showing 19 changed files with 157 additions and 6,139 deletions.
5 changes: 5 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[flake8]
ignore = E226
max-line-length = 120
exclude = tests/*
max-complexity = 10
2 changes: 1 addition & 1 deletion base/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,4 +219,4 @@ def internal_server_error(e):
from base.task import *
from base.views import *
from base.views.api import *
from base.manage import (initdb)
from base.manage import (init_db)
47 changes: 42 additions & 5 deletions base/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,70 @@
"""
This page is intended to store application constants that change
very infrequently (if ever).
very infrequently (if ever).
Author: Daniel E. Cook ([email protected])
"""


# PRICES
class PRICES:
    """Price constants, namespaced in a class so they can be iterated
    and written to the metadata table by ``init_db``.

    NOTE(review): values look like USD amounts for strain/set orders
    plus shipping — confirm against the ordering views.
    """
    DIVERGENT_SET = 160
    STRAIN_SET = 640
    STRAIN = 15
    SHIPPING = 65


# BUILDS AND RELEASES
WORMBASE_BUILD = "WS261"
# Dataset releases, newest first (CURRENT_RELEASE relies on this ordering).
RELEASES = ["20170531",
            "20160408"]
CURRENT_RELEASE = RELEASES[0]


# URLS
BAM_URL_PREFIX = "https://elegansvariation.org.s3.amazonaws.com/bam"

# Maps chromosome in roman numerals to integer
CHROM_NUMERIC = {"I": 1,
                 "II": 2,
                 "III": 3,
                 "IV": 4,
                 "V": 5,
                 "X": 6,
                 "MtDNA": 7}


class URLS:
    """
    URLs are stored here so they can be easily integrated into the database
    for provenance purposes.
    """

    #
    # AWS URLs
    #
    BAM_URL_PREFIX = "https://elegansvariation.org.s3.amazonaws.com/bam"

    #
    # Wormbase URLs
    #

    # Gene GTF; defines biotype, start, stop, etc.
    GENE_GTF_URL = f"ftp://ftp.wormbase.org/pub/wormbase/releases/{WORMBASE_BUILD}/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.{WORMBASE_BUILD}.canonical_geneset.gtf.gz"

    # Gene GFF; used for the gene summary table.
    GENE_GFF_URL = f"ftp://ftp.wormbase.org/pub/wormbase/releases/{WORMBASE_BUILD}/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.{WORMBASE_BUILD}.annotations.gff3.gz"

    # Maps wormbase ID to locus name.
    # (Plain strings: these two URLs have no placeholders, so no f-prefix.)
    GENE_IDS_URL = "ftp://ftp.wormbase.org/pub/wormbase/species/c_elegans/annotation/geneIDs/c_elegans.PRJNA13758.current.geneIDs.txt.gz"

    # Lists C. elegans orthologs
    ORTHOLOG_URL = "ftp://ftp.wormbase.org/pub/wormbase/species/c_elegans/PRJNA13758/annotation/orthologs/c_elegans.PRJNA13758.current_development.orthologs.txt"

    #
    # Ortholog URLs
    #

    # Homologene database
    HOMOLOGENE_URL = 'https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data'

    # Taxon IDs (NCBI taxonomy dump; maps taxon ids to species names)
    TAXON_ID_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
29 changes: 6 additions & 23 deletions base/db/etl_homologene.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,24 @@
import tarfile
import csv
import requests
from base.db.etl_wormbase import get_gene_ids
from urllib.request import urlretrieve, urlopen
from urllib.request import urlretrieve
from tempfile import NamedTemporaryFile
from base.models2 import wormbase_gene_summary_m


# Homologene database
HOMOLOGENE_URL = 'https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data'
TAXON_ID_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
from base.constants import URLS


def fetch_taxon_ids():
    """
    Downloads the NCBI taxonomy dump and builds a mapping of
    taxon-ids to species names.

    Returns:
        dict: {taxon_id (int): scientific_name (str)}
    """
    taxon_file = NamedTemporaryFile(suffix='tar')
    out, err = urlretrieve(URLS.TAXON_ID_URL, taxon_file.name)
    tar = tarfile.open(out)
    # Read names file; one record per line, fields delimited by "\t|\t"
    # (trailing field ends with "\t|", hence the optional tab).
    names_dmp = tar.extractfile('names.dmp')
    names_dmp = names_dmp.read().decode('utf-8').splitlines()
    # Raw string so the escaped pipe reaches the regex engine verbatim.
    lines = [re.split(r'\t\|[\t]?', x) for x in names_dmp]
    # Keep only the canonical 'scientific name' entry for each taxon.
    taxon_ids = {int(l[0]): l[1] for l in lines if l[3] == 'scientific name'}
    return taxon_ids


Expand All @@ -58,21 +53,10 @@ def fetch_homologene():
* gene_symbol = Gene Symbol
* species = species name
"""
response = requests.get(HOMOLOGENE_URL)
response = requests.get(URLS.HOMOLOGENE_URL)
response_csv = list(csv.reader(response.content.decode('utf-8').splitlines(), delimiter='\t'))

taxon_ids = fetch_taxon_ids()

# In this loop we add the homologene id (line[0]) if there's a c_elegans gene
# (line[1]) in the group.
fields = ['homologene',
'taxon_id',
'gene_id',
'gene_symbol',
'protein_gi',
'protein_accession',
'species',
'locus_name']
taxon_ids = fetch_taxon_ids()

# First, fetch records with a homolog ID that possesses a C. elegans gene.
elegans_set = dict([[int(x[0]), x[3]] for x in response_csv if x[1] == '6239'])
Expand All @@ -91,4 +75,3 @@ def fetch_homologene():
'homolog_gene': line[3],
'homolog_source': "Homologene",
'is_ortholog': False}

18 changes: 12 additions & 6 deletions base/db/etl_strains.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,22 @@
from base.utils.google_sheets import get_google_sheet
from base.utils.gcloud import get_item
from logzero import logger
# Not a constant! Fetched from the cloud credential store at import time.
ELEVATION_API_KEY = get_item('credential', 'elevation').get('apiKey')


@lru_cache(maxsize=32)
def fetch_elevations(lat, lon):
"""
Fetch elevation
Fetch elevation.
@lru_cache decorator caches result so we only make one
call per unique lat/lon result.
"""
result = requests.get(f"https://maps.googleapis.com/maps/api/elevation/json?locations={lat},{lon}&key={ELEVATION_API_KEY}")
elevation_url = f"https://maps.googleapis.com/maps/api/elevation/json?locations={lat},{lon}&key={ELEVATION_API_KEY}"
result = requests.get(elevation_url)
if result.ok:
elevation = result.json()['results'][0]['elevation']
return elevation
Expand All @@ -41,10 +48,9 @@ def fetch_andersen_strains():
strain_records = WI.get_all_records()
strain_records = list(filter(lambda x: x.get('isotype') not in ['', None, 'NA'], strain_records))
for n, record in enumerate(strain_records):
set_list = ','.join([set_name.split("_")[1] for set_name, set_val in record.items()
set_list = ','.join([set_name.split("_")[1] for set_name, set_val in record.items()
if 'set_' in set_name and set_val == "TRUE"])
record['sets'] = set_list
latlon_set = []
for k, v in record.items():
# Set NA to None
if v in ["NA", '']:
Expand All @@ -56,4 +62,4 @@ def fetch_andersen_strains():
record['elevation'] = fetch_elevations(record['latitude'], record['longitude'])
if n % 50 == 0:
logger.info(f"Loaded {n} strains")
return strain_records
return strain_records
41 changes: 13 additions & 28 deletions base/db/etl_wormbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
Functions in this script are used to load
information from wormbase into the
information from wormbase into the
CeNDR database
Author: Daniel E. Cook ([email protected])
Expand All @@ -11,25 +11,11 @@
import csv
import gzip
from gtfparse import read_gtf_as_dataframe
from urllib.request import urlretrieve, urlopen
from urllib.request import urlretrieve
from tempfile import NamedTemporaryFile
from base.constants import WORMBASE_BUILD, CHROM_NUMERIC
from base.utils.genetic_utils import arm_or_center
from base.models2 import wormbase_gene_summary_m
from base.constants import URLS, CHROM_NUMERIC

# Gene GTF defines biotype, start, stop, etc.
# The GTF does not include locus names (pot-2, etc), so we download them in the get_gene_ids function.
GENE_GTF_URL = f"ftp://ftp.wormbase.org/pub/wormbase/releases/{WORMBASE_BUILD}/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.{WORMBASE_BUILD}.canonical_geneset.gtf.gz"


# GENE GFF_URL
GENE_GFF_URL = f"ftp://ftp.wormbase.org/pub/wormbase/releases/{WORMBASE_BUILD}/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.{WORMBASE_BUILD}.annotations.gff3.gz"

# Maps wormbase ID to locus name
GENE_IDS_URL = f"ftp://ftp.wormbase.org/pub/wormbase/species/c_elegans/annotation/geneIDs/c_elegans.PRJNA13758.current.geneIDs.txt.gz"

# Lists C. elegans orthologs
ORTHOLOG_URL = f"ftp://ftp.wormbase.org/pub/wormbase/species/c_elegans/PRJNA13758/annotation/orthologs/c_elegans.PRJNA13758.current_development.orthologs.txt"

def get_gene_ids():
"""
Expand All @@ -38,10 +24,9 @@ def get_gene_ids():
Gene locus names (e.g. pot-2)
"""
gene_locus_names_file = NamedTemporaryFile('wb', suffix=".gz")
out, err = urlretrieve(GENE_IDS_URL, gene_locus_names_file.name)
return dict([x.split(",")[1:3] for x in gzip.open(out, 'r').read().decode('utf-8').splitlines()])


out, err = urlretrieve(URLS.GENE_IDS_URL, gene_locus_names_file.name)
results = [x.split(",")[1:3] for x in gzip.open(out, 'r').read().decode('utf-8').splitlines()]
return dict(results)


def fetch_gene_gtf():
Expand All @@ -51,16 +36,16 @@ def fetch_gene_gtf():
and yields a dictionary for each row.
"""
gene_gtf_file = NamedTemporaryFile('wb', suffix=".gz")
out, err = urlretrieve(GENE_GTF_URL, gene_gtf_file.name)
out, err = urlretrieve(URLS.GENE_GTF_URL, gene_gtf_file.name)
gene_gtf = read_gtf_as_dataframe(gene_gtf_file.name)

gene_ids = get_gene_ids()
# Add locus column
# Rename seqname to chrom
gene_gtf = gene_gtf.rename({'seqname':'chrom'}, axis='columns')
gene_gtf = gene_gtf.rename({'seqname': 'chrom'}, axis='columns')
gene_gtf = gene_gtf.assign(locus=[gene_ids.get(x) for x in gene_gtf.gene_id])
gene_gtf = gene_gtf.assign(chrom_num=[CHROM_NUMERIC[x] for x in gene_gtf.chrom])
gene_gtf = gene_gtf.assign(pos = (((gene_gtf.end - gene_gtf.start)/2) + gene_gtf.start).map(int))
gene_gtf = gene_gtf.assign(pos=(((gene_gtf.end - gene_gtf.start)/2) + gene_gtf.start).map(int))
gene_gtf['arm_or_center'] = gene_gtf.apply(lambda row: arm_or_center(row['chrom'], row['pos']), axis=1)
for row in gene_gtf.to_dict('records'):
yield row
Expand All @@ -75,7 +60,7 @@ def fetch_gene_gff_summary():
"""

gene_gff_file = NamedTemporaryFile('wb', suffix=".gz")
out, err = urlretrieve(GENE_GFF_URL, gene_gff_file.name)
out, err = urlretrieve(URLS.GENE_GFF_URL, gene_gff_file.name)

WB_GENE_FIELDSET = ['ID', 'biotype', 'sequence_name', 'chrom', 'start', 'end', 'locus']

Expand All @@ -88,7 +73,7 @@ def fetch_gene_gff_summary():
gene.update(zip(["chrom", "start", "end"],
[line[0], line[3], line[4]]))
gene = {k.lower(): v for k, v in gene.items() if k in WB_GENE_FIELDSET}

# Change add chrom_num
gene['chrom_num'] = CHROM_NUMERIC[gene['chrom']]
gene['start'] = int(gene['start'])
Expand All @@ -110,7 +95,7 @@ def fetch_orthologs():
Fetches orthologs from wormbase; Stored in the homolog table.
"""
orthologs_file = NamedTemporaryFile('wb', suffix=".txt")
out, err = urlretrieve(ORTHOLOG_URL , orthologs_file.name)
out, err = urlretrieve(URLS.ORTHOLOG_URL, orthologs_file.name)
csv_out = list(csv.reader(open(out, 'r'), delimiter='\t'))

for line in csv_out:
Expand All @@ -126,4 +111,4 @@ def fetch_orthologs():
'homolog_taxon_id': None,
'homolog_gene': line[2],
'homolog_source': line[3],
'is_ortholog': line[0] == 'Caenorhabditis elegans'}
'is_ortholog': line[0] == 'Caenorhabditis elegans'}
45 changes: 40 additions & 5 deletions base/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,21 @@
import arrow
from click import secho
from base.application import app, db_2
from base.models2 import strain_m, wormbase_gene_summary_m, wormbase_gene_m, homologs_m
from base.models2 import (metadata_m,
strain_m,
wormbase_gene_summary_m,
wormbase_gene_m,
homologs_m)
from base.db.etl_strains import fetch_andersen_strains
from base.db.etl_wormbase import fetch_gene_gtf, fetch_gene_gff_summary, fetch_orthologs
from base.db.etl_wormbase import (fetch_gene_gtf,
fetch_gene_gff_summary,
fetch_orthologs)
from base.db.etl_homologene import fetch_homologene
from base import constants


@app.cli.command()
def initdb():
def init_db():
"""Initialize the database."""
start = arrow.utcnow()
secho('Initializing Database', fg="green")
Expand All @@ -18,6 +26,31 @@ def initdb():

secho('Created cendr.db', fg="green")

################
# Set metadata #
################
secho('Inserting metadata', fg="green")
date_created = metadata_m(key="date_created", value=arrow.utcnow().datetime.isoformat())
db_2.session.add(date_created)
for var in vars(constants):
if not var.startswith("_"):
# For nested constants:
current_var = getattr(constants, var)
if type(current_var) == type:
for name in [x for x in vars(current_var) if not x.startswith("_")]:
key_val = metadata_m(key="{}/{}".format(var, name),
value=getattr(current_var, name))
db_2.session.add(key_val)
elif type(current_var) == list:
key_val = metadata_m(key=var,
value=','.join(getattr(constants, var)))
db_2.session.add(key_val)
else:
key_val = metadata_m(key=var, value=str(getattr(constants, var)))
db_2.session.add(key_val)

db_2.session.commit()

##############
# Load Genes #
##############
Expand All @@ -39,7 +72,7 @@ def initdb():
db_2.session.bulk_insert_mappings(homologs_m, fetch_homologene())
secho('Loading orthologs from WormBase', fg='green')
db_2.session.bulk_insert_mappings(homologs_m, fetch_orthologs())

################
# Load Strains #
################
Expand All @@ -48,4 +81,6 @@ def initdb():
db_2.session.commit()
secho(f"Inserted {strain_m.query.count()} strains", fg="blue")
diff = int((arrow.utcnow() - start).total_seconds())
secho(f"{diff} seconds")
secho(f"{diff} seconds")


Loading

0 comments on commit f755c1d

Please sign in to comment.