-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6fec56a
commit f755c1d
Showing
19 changed files
with
157 additions
and
6,139 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
[flake8] | ||
ignore = E226 | ||
max-line-length = 120 | ||
exclude = tests/* | ||
max-complexity = 10 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,33 +2,70 @@ | |
""" | ||
This module is intended to store application constants that change | ||
very infrequently (if ever). | ||
very infrequently (if ever). | ||
Author: Daniel E. Cook ([email protected]) | ||
""" | ||
|
||
|
||
# PRICES | ||
class PRICES:
    """Namespace of fixed price constants.

    The values are plain integers; the currency/unit is not stated in this
    module (presumably USD -- confirm against the ordering code).
    """

    # Price of a divergent strain set.
    DIVERGENT_SET = 160
    # Price of a full strain set.
    STRAIN_SET = 640
    # Price of an individual strain.
    STRAIN = 15
    # Shipping charge.
    SHIPPING = 65
|
||
|
||
# BUILDS AND RELEASES | ||
WORMBASE_BUILD = "WS261" | ||
RELEASES = ["20170531", | ||
"20160408"] | ||
CURRENT_RELEASE = RELEASES[0] | ||
|
||
|
||
# URLS | ||
BAM_URL_PREFIX = "https://elegansvariation.org.s3.amazonaws.com/bam" | ||
|
||
# Maps chromosome in roman numerals to integer | ||
CHROM_NUMERIC = {"I": 1, | ||
"II": 2, | ||
"III": 3, | ||
"IV": 4, | ||
"V": 5, | ||
"X": 6, | ||
"MtDNA": 7} | ||
"MtDNA": 7} | ||
|
||
|
||
class URLS:
    """
    URLs are stored here so they can be easily integrated into the database
    for provenance purposes.

    Release-specific Wormbase URLs are built from the module-level
    WORMBASE_BUILD constant when this class body is executed.
    """

    #
    # AWS URLs
    #
    BAM_URL_PREFIX = "https://elegansvariation.org.s3.amazonaws.com/bam"

    #
    # Wormbase URLs
    #

    # Gene GTF
    GENE_GTF_URL = (
        f"ftp://ftp.wormbase.org/pub/wormbase/releases/{WORMBASE_BUILD}"
        "/species/c_elegans/PRJNA13758"
        f"/c_elegans.PRJNA13758.{WORMBASE_BUILD}.canonical_geneset.gtf.gz"
    )

    # Gene GFF
    GENE_GFF_URL = (
        f"ftp://ftp.wormbase.org/pub/wormbase/releases/{WORMBASE_BUILD}"
        "/species/c_elegans/PRJNA13758"
        f"/c_elegans.PRJNA13758.{WORMBASE_BUILD}.annotations.gff3.gz"
    )

    # Maps wormbase ID to locus name
    # (plain string: the URL has no placeholders, so no f-prefix is needed)
    GENE_IDS_URL = (
        "ftp://ftp.wormbase.org/pub/wormbase/species/c_elegans/annotation/geneIDs"
        "/c_elegans.PRJNA13758.current.geneIDs.txt.gz"
    )

    # Lists C. elegans orthologs
    # (plain string: the URL has no placeholders, so no f-prefix is needed)
    ORTHOLOG_URL = (
        "ftp://ftp.wormbase.org/pub/wormbase/species/c_elegans/PRJNA13758/annotation/orthologs"
        "/c_elegans.PRJNA13758.current_development.orthologs.txt"
    )

    #
    # Ortholog URLs
    #

    # Homologene
    HOMOLOGENE_URL = 'https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data'

    # Taxon IDs
    TAXON_ID_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
""" | ||
Functions in this script are used to load | ||
information from wormbase into the | ||
information from wormbase into the | ||
CeNDR database | ||
Author: Daniel E. Cook ([email protected]) | ||
|
@@ -11,25 +11,11 @@ | |
import csv | ||
import gzip | ||
from gtfparse import read_gtf_as_dataframe | ||
from urllib.request import urlretrieve, urlopen | ||
from urllib.request import urlretrieve | ||
from tempfile import NamedTemporaryFile | ||
from base.constants import WORMBASE_BUILD, CHROM_NUMERIC | ||
from base.utils.genetic_utils import arm_or_center | ||
from base.models2 import wormbase_gene_summary_m | ||
from base.constants import URLS, CHROM_NUMERIC | ||
|
||
# Gene GTF defines biotype, start, stop, etc. | ||
# The GTF does not include locus names (pot-2, etc), so we download them in the get_gene_ids function. | ||
GENE_GTF_URL = f"ftp://ftp.wormbase.org/pub/wormbase/releases/{WORMBASE_BUILD}/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.{WORMBASE_BUILD}.canonical_geneset.gtf.gz" | ||
|
||
|
||
# GENE GFF_URL | ||
GENE_GFF_URL = f"ftp://ftp.wormbase.org/pub/wormbase/releases/{WORMBASE_BUILD}/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.{WORMBASE_BUILD}.annotations.gff3.gz" | ||
|
||
# Maps wormbase ID to locus name | ||
GENE_IDS_URL = f"ftp://ftp.wormbase.org/pub/wormbase/species/c_elegans/annotation/geneIDs/c_elegans.PRJNA13758.current.geneIDs.txt.gz" | ||
|
||
# Lists C. elegans orthologs | ||
ORTHOLOG_URL = f"ftp://ftp.wormbase.org/pub/wormbase/species/c_elegans/PRJNA13758/annotation/orthologs/c_elegans.PRJNA13758.current_development.orthologs.txt" | ||
|
||
def get_gene_ids(): | ||
""" | ||
|
@@ -38,10 +24,9 @@ def get_gene_ids(): | |
Gene locus names (e.g. pot-2) | ||
""" | ||
gene_locus_names_file = NamedTemporaryFile('wb', suffix=".gz") | ||
out, err = urlretrieve(GENE_IDS_URL, gene_locus_names_file.name) | ||
return dict([x.split(",")[1:3] for x in gzip.open(out, 'r').read().decode('utf-8').splitlines()]) | ||
|
||
|
||
out, err = urlretrieve(URLS.GENE_IDS_URL, gene_locus_names_file.name) | ||
results = [x.split(",")[1:3] for x in gzip.open(out, 'r').read().decode('utf-8').splitlines()] | ||
return dict(results) | ||
|
||
|
||
def fetch_gene_gtf(): | ||
|
@@ -51,16 +36,16 @@ def fetch_gene_gtf(): | |
and yields a dictionary for each row. | ||
""" | ||
gene_gtf_file = NamedTemporaryFile('wb', suffix=".gz") | ||
out, err = urlretrieve(GENE_GTF_URL, gene_gtf_file.name) | ||
out, err = urlretrieve(URLS.GENE_GTF_URL, gene_gtf_file.name) | ||
gene_gtf = read_gtf_as_dataframe(gene_gtf_file.name) | ||
|
||
gene_ids = get_gene_ids() | ||
# Add locus column | ||
# Rename seqname to chrom | ||
gene_gtf = gene_gtf.rename({'seqname':'chrom'}, axis='columns') | ||
gene_gtf = gene_gtf.rename({'seqname': 'chrom'}, axis='columns') | ||
gene_gtf = gene_gtf.assign(locus=[gene_ids.get(x) for x in gene_gtf.gene_id]) | ||
gene_gtf = gene_gtf.assign(chrom_num=[CHROM_NUMERIC[x] for x in gene_gtf.chrom]) | ||
gene_gtf = gene_gtf.assign(pos = (((gene_gtf.end - gene_gtf.start)/2) + gene_gtf.start).map(int)) | ||
gene_gtf = gene_gtf.assign(pos=(((gene_gtf.end - gene_gtf.start)/2) + gene_gtf.start).map(int)) | ||
gene_gtf['arm_or_center'] = gene_gtf.apply(lambda row: arm_or_center(row['chrom'], row['pos']), axis=1) | ||
for row in gene_gtf.to_dict('records'): | ||
yield row | ||
|
@@ -75,7 +60,7 @@ def fetch_gene_gff_summary(): | |
""" | ||
|
||
gene_gff_file = NamedTemporaryFile('wb', suffix=".gz") | ||
out, err = urlretrieve(GENE_GFF_URL, gene_gff_file.name) | ||
out, err = urlretrieve(URLS.GENE_GFF_URL, gene_gff_file.name) | ||
|
||
WB_GENE_FIELDSET = ['ID', 'biotype', 'sequence_name', 'chrom', 'start', 'end', 'locus'] | ||
|
||
|
@@ -88,7 +73,7 @@ def fetch_gene_gff_summary(): | |
gene.update(zip(["chrom", "start", "end"], | ||
[line[0], line[3], line[4]])) | ||
gene = {k.lower(): v for k, v in gene.items() if k in WB_GENE_FIELDSET} | ||
|
||
# Add chrom_num (integer index of the chromosome) | ||
gene['chrom_num'] = CHROM_NUMERIC[gene['chrom']] | ||
gene['start'] = int(gene['start']) | ||
|
@@ -110,7 +95,7 @@ def fetch_orthologs(): | |
Fetches orthologs from wormbase; Stored in the homolog table. | ||
""" | ||
orthologs_file = NamedTemporaryFile('wb', suffix=".txt") | ||
out, err = urlretrieve(ORTHOLOG_URL , orthologs_file.name) | ||
out, err = urlretrieve(URLS.ORTHOLOG_URL, orthologs_file.name) | ||
csv_out = list(csv.reader(open(out, 'r'), delimiter='\t')) | ||
|
||
for line in csv_out: | ||
|
@@ -126,4 +111,4 @@ def fetch_orthologs(): | |
'homolog_taxon_id': None, | ||
'homolog_gene': line[2], | ||
'homolog_source': line[3], | ||
'is_ortholog': line[0] == 'Caenorhabditis elegans'} | ||
'is_ortholog': line[0] == 'Caenorhabditis elegans'} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.