Skip to content

Commit 56e27d6

Browse files
author
Maria Simbirsky
committed
Add dnanexus applets pertaining to scaffolding pipeline
1 parent e0197f2 commit 56e27d6

52 files changed

Lines changed: 1279 additions & 0 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.project

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<projectDescription>
3+
<name>vgp-assembly</name>
4+
<comment></comment>
5+
<projects>
6+
</projects>
7+
<buildSpec>
8+
<buildCommand>
9+
<name>org.python.pydev.PyDevBuilder</name>
10+
<arguments>
11+
</arguments>
12+
</buildCommand>
13+
</buildSpec>
14+
<natures>
15+
<nature>org.python.pydev.pythonNature</nature>
16+
</natures>
17+
</projectDescription>

.pydevproject

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2+
<?eclipse-pydev version="1.0"?><pydev_project>
3+
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
4+
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python interpreter</pydev_property>
5+
</pydev_project>
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
{
2+
"name": "bionano_fa2cmap",
3+
"title": "Bionano FASTA to CMAP",
4+
"version": "0.0.1",
5+
"inputSpec": [
6+
{
7+
"name": "ngs_fasta",
8+
"class": "file",
9+
"patterns": ["*.fasta", "*.fasta.gz", "*.fa", "*.fa.gz"],
10+
"optional": false,
11+
"label": "FASTA file from NGS",
12+
"help": "An assembly produced from sequencing technologies such as PacBio or Illumina"
13+
},
14+
{
15+
"name": "enzyme_name",
16+
"label": "Bionano Enzyme Name",
17+
"class": "string",
18+
"optional": false,
19+
"choices": [
20+
"BSSSI",
21+
"BSPQI",
22+
"BBVCI",
23+
"BSMI",
24+
"BSRDI",
25+
"BSECI",
26+
"DLE1"
27+
]
28+
},
29+
{
30+
"name": "channel_num",
31+
"label": "Bionano Channel Number",
32+
"class": "int",
33+
"optional": false,
34+
"default": 1
35+
}
36+
],
37+
"outputSpec": [
38+
{
39+
"name": "reference_cmap",
40+
"class": "file",
41+
"patterns": ["*.cmap"],
42+
"label": "Reference CMAP"
43+
}
44+
],
45+
"runSpec":
46+
{
47+
"file": "src/bionano_fa2cmap.py",
48+
"interpreter": "python2.7",
49+
"distribution": "Ubuntu",
50+
"release": "14.04"
51+
},
52+
"regionalOptions": {
53+
"aws:us-east-1": {"systemRequirements": {"*": {"instanceType": "mem1_ssd1_x32"}},
54+
"assetDepends": [{"id": "record-FGVz00Q0Q1zB3fPk3JqVfKzZ"} ]},
55+
"azure:westus": {"systemRequirements": {"*": {"instanceType": "azure:mem4_ssd1_x32"}},
56+
"assetDepends": [{"id": "record-FGXxb8Q9Z8XYPXBv9b3q56kF"} ]}
57+
}
58+
}
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
from __future__ import print_function
2+
import re
3+
import os
4+
import sys
5+
import tempfile
6+
import subprocess
7+
import multiprocessing
8+
import multiprocessing.pool
9+
10+
import dxpy
11+
12+
def normalize_timedelta(timedelta):
13+
"""
14+
Given a string like "1w" or "5d", convert it to an integer in seconds.
15+
Integers without a suffix are interpreted as seconds.
16+
Note: not related to the datetime timedelta class.
17+
"""
18+
try:
19+
return int(timedelta)
20+
except ValueError as e:
21+
suffix_multipliers = {'s': 1, 'm': 60, 'h': 60*60, 'd': 60*60*24, 'w': 60*60*24*7,
22+
'M': 60*60*24*30, 'y': 60*60*24*365}
23+
24+
re_match = re.match('([0-9]+)([\s]*)([a-zA-Z]*)', timedelta)
25+
if re_match is None:
26+
msg = 'Could not parse time delta {0}'.format(timedelta)
27+
raise dxpy.AppInternalError(msg)
28+
29+
t, space, suffix = re_match.groups()
30+
31+
if suffix in suffix_multipliers:
32+
normalized_time = int(t) * suffix_multipliers[suffix]
33+
elif suffix not in suffix_multipliers and len(suffix) > 1 and suffix[0] in suffix_multipliers:
34+
old_suffix = suffix
35+
suffix = suffix[0]
36+
print('Not familiar with suffix {0}. Assuming you meant {1}.'.format(old_suffix, suffix), file=sys.stderr)
37+
normalized_time = int(t) * suffix_multipliers[suffix]
38+
else:
39+
msg = 'Valid suffixes for time duration are {0}.'.format(str(suffix_multipliers.keys()))
40+
raise dxpy.AppInternalError(msg)
41+
42+
return normalized_time
43+
44+
45+
def _eap_wrapper(func, q, *args, **kwargs):
46+
try:
47+
result = func(*args, **kwargs)
48+
except Exception as e:
49+
q.put(e.__str__())
50+
pass
51+
52+
return result
53+
54+
55+
def get_project(project_name):
56+
'''Try to find the project with the given name or id.'''
57+
58+
# First, see if the project is a project-id.
59+
try:
60+
project = dxpy.DXProject(project_name)
61+
return project
62+
except dxpy.DXError:
63+
pass
64+
65+
project = dxpy.find_projects(name=project_name, name_mode='glob', return_handler=True, level="VIEW")
66+
project = [p for p in project]
67+
if len(project) < 1:
68+
print('Did not find project {0}.'.format(project_name), file=sys.stderr)
69+
sys.exit(1)
70+
elif len(project) > 1:
71+
print('Found more than 1 project matching {0}.'.format(project_name), file=sys.stderr)
72+
sys.exit(1)
73+
else:
74+
project = project[0]
75+
76+
return project
77+
78+
79+
class ExceptionAwarePool(multiprocessing.pool.Pool):
80+
def __init__(self, *args, **kwargs):
81+
super(ExceptionAwarePool, self).__init__(*args, **kwargs)
82+
self.manager = multiprocessing.Manager()
83+
self.q = self.manager.Queue()
84+
85+
def apply_async(self, func, args=(), kwds={}, callback=None):
86+
return super(ExceptionAwarePool, self).apply_async(_eap_wrapper, args=(func, self.q) + args, kwds=kwds, callback=callback)
87+
88+
def join(self):
89+
super(ExceptionAwarePool, self).join()
90+
if self.q.qsize() > 0:
91+
raise Exception('\n'.join([self.q.get() for i in xrange(self.q.qsize())]))
92+
93+
94+
def get_memory(suffix='M'):
95+
if suffix == 'K':
96+
shift = 1
97+
elif suffix == 'M':
98+
shift = 1 << 10
99+
elif suffix == 'G':
100+
shift = 1 << 20
101+
else:
102+
raise dxpy.DXError('Unknown memory suffix {0}. Please choose from K, M, or G.'.format(suffix))
103+
104+
# Calc amount of memory available for gatk and Picard.
105+
total_mem = re.findall('^MemTotal:[\s]*([0-9]*) kB',
106+
open('/proc/meminfo').read())
107+
if(len(total_mem) != 1):
108+
raise dxpy.DXError('Problem reading system memory from /proc/meminfo')
109+
return float(total_mem[0]) / shift
110+
111+
112+
def run_cmd(cmd, returnOutput=False):
113+
if type(cmd) is list:
114+
shell=False
115+
executable = None
116+
print(subprocess.list2cmdline(cmd), file=sys.stderr)
117+
else:
118+
shell=True
119+
executable = '/bin/bash'
120+
print(cmd, file=sys.stderr)
121+
122+
if returnOutput:
123+
output = subprocess.check_output(cmd, shell=shell, executable=executable).strip()
124+
print(output)
125+
return output
126+
else:
127+
subprocess.check_call(cmd, shell=shell, executable=executable)
128+
129+
130+
class cd:
131+
'''From: http://stackoverflow.com/questions/431684/how-do-i-cd-in-python
132+
Context manager for changing the current working directory'''
133+
def __init__(self, newPath=None, tempDir=None):
134+
if newPath is not None:
135+
self.newPath = newPath
136+
self.removeFolder = False
137+
else:
138+
self.newPath = tempfile.mkdtemp(dir=tempDir)
139+
self.removeFolder = True
140+
141+
def __enter__(self):
142+
self.savedPath = os.getcwd()
143+
os.chdir(self.newPath)
144+
145+
def __exit__(self, etype, value, traceback):
146+
os.chdir(self.savedPath)
147+
if self.removeFolder:
148+
subprocess.check_call('rm -rf {0}'.format(self.newPath), shell=True)
149+
150+
151+
def schedule_lpt(jobs, num_bins):
152+
'''This function implements the Longest Processing Time algorithm to get
153+
a good division of labor for the multiprocessor scheduling problem.'''
154+
155+
def _index_min(values):
156+
# Efficient index of min from stackoverflow.com/questions/2474015
157+
return min(xrange(len(values)), key=values.__getitem__)
158+
159+
# We expect a list of tuples, with the first value the name of the
160+
# job and the second value the weight. If we are given a dict
161+
# then convert keys to job names and values to weights.
162+
if(type(jobs) == dict):
163+
jobs = zip(jobs.keys(), jobs.values())
164+
165+
num_bins = min(num_bins, len(jobs))
166+
jobs.sort(key=lambda j: j[1], reverse=True)
167+
partition = {'groups': [[] for i in xrange(num_bins)],
168+
'size': [0 for i in xrange(num_bins)]}
169+
170+
for job in jobs:
171+
idx = _index_min(partition['size'])
172+
partition['groups'][idx] += [job[0]]
173+
partition['size'][idx] += job[1]
174+
175+
return partition['groups']
176+
177+
178+
def gzip_and_upload(fn, rfn=None):
179+
if rfn is None:
180+
rfn = os.path.split(fn)[-1]
181+
cmd = 'gzip --fast -c {0} | dx upload --brief --path {1}.gz -'.format(fn, rfn)
182+
print(cmd)
183+
fid = subprocess.check_output(cmd, shell=True).strip()
184+
185+
return dxpy.dxlink(fid)
186+
187+
188+
def tar_files_and_upload(filenames, prefix):
189+
with tempfile.NamedTemporaryFile(delete=False) as fh:
190+
fh.write('\n'.join(filenames))
191+
192+
ofn = '{0}.tar.gz'.format(prefix)
193+
cmd = 'tar cvf - --files-from {0} | gzip --fast | dx upload --brief --destination "{1}" - '.format(fh.name, ofn)
194+
fid = run_cmd(cmd, returnOutput=True)
195+
cmd = ['rm', fh.name]
196+
run_cmd(cmd)
197+
198+
return dxpy.dxlink(fid)
199+
200+
def remove_special_chars(string):
201+
'''function that replaces any characters in a string that are not
202+
alphanumeric or _ or . so that they do not cause issues in commands'''
203+
string = "".join(
204+
char for char in string if char.isalnum() or char in ['_', '.'])
205+
206+
return string
207+
208+
def download_and_gunzip_file(input_file, skip_decompress=False, additional_pipe=None, create_named_pipe=False, input_filename=None):
209+
input_file = dxpy.DXFile(input_file)
210+
if input_filename is None:
211+
input_filename = input_file.describe()['name']
212+
ofn = remove_special_chars(input_filename)
213+
214+
cmd = 'dx download ' + input_file.get_id() + ' -o - '
215+
if input_filename.endswith('.tar.gz'):
216+
ofn = 'tar_output_{0}'.format(ofn.replace('.tar.gz', ''))
217+
cmd += '| tar -zxvf - '
218+
elif input_filename.endswith('.tar.bz2'):
219+
ofn = 'tar_output_{0}'.format(ofn.replace('.tar.bz2', ''))
220+
cmd += '| tar -jxvf - '
221+
elif input_filename.endswith('.tar'):
222+
ofn = 'tar_output_{0}'.format(ofn.replace('.tar', ''))
223+
cmd += '| tar -xvf - '
224+
elif (os.path.splitext(input_filename)[-1] == '.gz') and not skip_decompress:
225+
cmd += '| gunzip '
226+
ofn = os.path.splitext(ofn)[0]
227+
if additional_pipe is not None:
228+
cmd += '| ' + additional_pipe
229+
cmd += ' > "{0}"'.format(ofn)
230+
231+
if create_named_pipe:
232+
named_pipe_cmd = 'mkfifo {0}'.format(ofn)
233+
run_cmd(named_pipe_cmd)
234+
cmd += '&'
235+
236+
run_cmd(cmd)
237+
238+
return ofn

0 commit comments

Comments
 (0)