forked from SuLab/fiSSEA
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmyvariant.py
445 lines (376 loc) · 17.6 KB
/
myvariant.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
'''
Python Client for MyVariant.info services
'''
from __future__ import print_function
import sys
import time
import httplib2
import json
# pandas is an optional dependency: it is only needed for the
# ``as_dataframe`` options, so a failed import must not break the module.
try:
    from pandas import DataFrame
    df_avail = True
except Exception:
    # ``Exception`` (not a bare ``except``) keeps the best-effort behaviour for
    # any import-time failure while letting KeyboardInterrupt/SystemExit through.
    df_avail = False

__version__ = '2.2.0'

# Python 2/3 compatibility shims for string type checks and URL encoding.
if sys.version_info[0] >= 3:
    str_types = str
    from urllib.parse import urlencode
else:
    str_types = (str, unicode)
    from urllib import urlencode
def alwayslist(value):
    '''Return *value* as-is if it is a list or tuple, else wrap it in a list.

    :param value: any object.
    :return: the same list/tuple object that was passed in, or ``[value]``
        for every other type (including strings and ``None``).

    Example:

    >>> alwayslist('abc')
    ['abc']
    >>> alwayslist(['abc', 'def'])
    ['abc', 'def']
    '''
    if isinstance(value, (list, tuple)):
        return value
    return [value]
def safe_str(s, encoding='utf-8'):
    '''Convert *s* to ``str``, falling back to an explicit encode.

    :param s: object to convert.
    :param encoding: encoding used only when ``str(s)`` raises
        ``UnicodeEncodeError``.
    :return: ``str(s)``, or ``s.encode(encoding)`` if that conversion fails.
    '''
    try:
        return str(s)
    except UnicodeEncodeError:
        # Reached on Python 2 when ``s`` is a unicode string holding
        # characters the default codec cannot encode.
        return s.encode(encoding)
def list_itemcnt(list):
    '''Return the number of occurrences of each distinct item.

    :param list: iterable of hashable items. The name shadows the builtin and
        is kept only for backward compatibility with keyword callers, so the
        builtin ``list`` must not be used inside this function.
    :return: a list of ``(item, count)`` tuples, one per distinct item.
    '''
    counts = {}
    for item in list:
        counts[item] = counts.get(item, 0) + 1
    return [(item, counts[item]) for item in counts]
class MyVariantInfo(object):
    '''Client for the MyVariant.info web services.

    NOTE(review): this client was adapted from the MyGene.info client. Some
    defaults and endpoints (the gene-oriented default ``fields``, the
    ``/gene/<id>`` endpoint used by :py:meth:`getgene`) still look
    gene-centric -- confirm them against the MyVariant.info API.

    Example:

    >>> mv = MyVariantInfo()
    '''

    def __init__(self, url='http://myvariant.info/api'):
        '''Create a client bound to the service root *url*.

        :param url: base URL of the API; a single trailing "/" is removed.
        '''
        # endswith() instead of url[-1] so that an empty string does not raise.
        self.url = url[:-1] if url.endswith('/') else url
        self.h = httplib2.Http()
        # Upper bound on the number of terms sent in one batch request.
        self.max_query = 1000
        # delay and step attributes are for batch queries:
        # ``delay`` is the sleep (seconds) between batches, ``step`` the batch size.
        self.delay = 1
        self.step = 1000

    def _as_dataframe(self, gene_obj, df_index=True):
        '''Convert a query result to a pandas DataFrame.

        :param gene_obj: either a dict containing a ``'hits'`` entry or an
            object ``DataFrame.from_dict`` accepts directly.
        :param df_index: if true, index the DataFrame by its ``'query'`` column.
        :return: a DataFrame, or ``None`` (after printing an error) when
            pandas is not installed.
        '''
        if not df_avail:
            print("Error: pandas module must be installed for as_dataframe option.")
            return
        source = gene_obj['hits'] if 'hits' in gene_obj else gene_obj
        df = DataFrame.from_dict(source)
        if df_index:
            df = df.set_index('query')
        return df

    @staticmethod
    def _handle_response(url, res, con, debug, return_raw):
        '''Turn a decoded HTTP response into the value returned to callers.

        :param url: URL that was requested (used for debug/error reporting).
        :param res: response object exposing a ``status`` attribute.
        :param con: response body, already decoded to text.
        :param debug: if true, return ``(url, res, con)`` without any check.
        :param return_raw: if true, return the body text instead of parsed JSON.
        :raises AssertionError: if the status is not 200 (and not *debug*).
        '''
        if debug:
            return url, res, con
        if res.status != 200:
            # Explicit raise keeps the AssertionError type callers may rely on,
            # but unlike an ``assert`` statement it still fires under ``python -O``.
            raise AssertionError((url, res, con))
        if return_raw:
            return con
        return json.loads(con)

    def _get(self, url, params=None):
        '''Send a GET request and return the processed response.

        :param url: endpoint URL without a query string.
        :param params: optional dict of query parameters. The keys ``debug``
            and ``return_raw`` are consumed locally and never sent.
        :return: parsed JSON by default; see ``debug``/``return_raw`` above.
        :raises AssertionError: if the HTTP status is not 200.
        '''
        # Work on a copy so neither a shared default nor the caller's dict
        # is mutated by the pops below.
        params = dict(params) if params else {}
        debug = params.pop('debug', False)
        return_raw = params.pop('return_raw', False)
        headers = {'user-agent': "Python-httplib2_myvariant.py/%s (gzip)" % httplib2.__version__}
        _url = url + '?' + urlencode(params) if params else url
        res, con = self.h.request(_url, headers=headers)
        con = con.decode("utf8")    # required in python3
        return self._handle_response(_url, res, con, debug, return_raw)

    def _post(self, url, params):
        '''Send a form-encoded POST request and return the processed response.

        :param url: endpoint URL.
        :param params: dict of form parameters. The keys ``debug`` and
            ``return_raw`` are consumed locally and never sent.
        :return: parsed JSON by default; see ``debug``/``return_raw`` above.
        :raises AssertionError: if the HTTP status is not 200.
        '''
        params = dict(params)   # do not mutate the caller's dict
        debug = params.pop('debug', False)
        return_raw = params.pop('return_raw', False)
        headers = {'content-type': 'application/x-www-form-urlencoded',
                   'user-agent': "Python-httplib2_myvariant.py/%s (gzip)" % httplib2.__version__}
        res, con = self.h.request(url, 'POST', body=urlencode(params), headers=headers)
        con = con.decode("utf8")    # required in python3
        return self._handle_response(url, res, con, debug, return_raw)

    def _is_entrez_id(self, id):
        '''Return True if *id* can be converted with ``int()``, else False.'''
        try:
            int(id)
        except (TypeError, ValueError, OverflowError):
            return False
        return True

    def _format_list(self, a_list, sep=','):
        '''Join a list/tuple into one *sep*-separated string.

        Any other value is returned unchanged, on the assumption that it is
        already a separated string.
        '''
        if isinstance(a_list, (list, tuple)):
            return sep.join([safe_str(x) for x in a_list])
        return a_list     # a_list is already a comma separated string

    def _repeated_query(self, query_fn, query_li, verbose=True, **fn_kwargs):
        '''Run *query_fn* over *query_li* in batches, yielding each result.

        :param query_fn: callable invoked as ``query_fn(batch, **fn_kwargs)``.
        :param query_li: sequence of query terms (must support ``len`` and slicing).
        :param verbose: print progress; forced off when one batch suffices.
        :return: a generator yielding one ``query_fn`` result per batch.

        The batch size is ``min(self.step, self.max_query)``; the generator
        sleeps ``self.delay`` seconds between batches (not after the last one).
        '''
        step = min(self.step, self.max_query)
        total = len(query_li)
        if total <= step:
            # No need to do series of batch queries, turn off verbose output
            verbose = False
        for start in range(0, total, step):
            end = start + step
            is_last_loop = end >= total
            if verbose:
                print("querying {0}-{1}...".format(start + 1, min(end, total)), end="")
            query_result = query_fn(query_li[start:end], **fn_kwargs)

            yield query_result

            if verbose:
                print("done.")
            if not is_last_loop and self.delay:
                time.sleep(self.delay)

    @property
    def metadata(self):
        '''Return the parsed response of the service's ``/metadata`` endpoint.

        Example:

        >>> metadata = mv.metadata
        '''
        return self._get(self.url + '/metadata')

    def getgene(self, geneid, fields='symbol,name,taxid,entrezgene', **kwargs):
        '''Return the object for the given id via GET ``/gene/<geneid>``.

        NOTE(review): endpoint, defaults and parameter descriptions are
        inherited from the MyGene.info client -- confirm they are valid for
        this service.

        :param geneid: id to look up; converted with ``str()``.
        :param fields: fields to return, a list or a comma-separated string.
        :param filter: alias for **fields**; takes precedence when given.
        :param kwargs: any other keyword arguments are sent as query
            parameters (inherited docs mention ``species`` and ``email``).
        :return: the parsed JSON response.

        Example:

        >>> mv.getgene('1017', fields='symbol,name,entrezgene,refseq')
        >>> mv.getgene('1017', fields=['symbol', 'name', 'pathway.kegg'])
        '''
        if fields:
            kwargs['fields'] = self._format_list(fields)
        if 'filter' in kwargs:
            kwargs['fields'] = self._format_list(kwargs['filter'])
        _url = self.url + '/gene/' + str(geneid)
        return self._get(_url, kwargs)

    def _getvariants_inner(self, geneids, **kwargs):
        '''POST one batch of ids to ``/variant`` and return the response.'''
        _kwargs = {'ids': self._format_list(geneids)}
        _kwargs.update(kwargs)
        _url = self.url + '/variant'
        return self._post(_url, _kwargs)

    def getvariants(self, geneids, fields='symbol,name,taxid,entrezgene', **kwargs):
        '''Return the objects for the given list of ids.

        This is a wrapper for batched POST queries to the "/variant" service.

        NOTE(review): the parameter name ``geneids`` and the default
        ``fields`` are inherited from the MyGene.info client -- confirm the
        defaults are meaningful for variants.

        :param geneids: a list/tuple of ids, or a comma-separated string.
        :param fields: fields to return, a list or a comma-separated string.
        :param filter: alias for **fields**; takes precedence when given.
        :param verbose: if True (default), print batch progress.
        :param return_raw: if true, return raw response text (one string for
            a single batch, otherwise a list of strings); disables as_dataframe.
        :param as_dataframe: if True, return a DataFrame (requires pandas).
        :param df_index: if True (default), index returned DataFrame by 'query'.
            Only applicable if as_dataframe=True.
        :param kwargs: any other keyword arguments are sent to the service.
        :return: a list of hits, raw text, or a pandas DataFrame.
        :raises ValueError: if *geneids* is empty or not a list/tuple/string.

        Example:

        >>> mv.getvariants(ids, fields="all")
        >>> mv.getvariants(ids, as_dataframe=True)

        .. Hint:: Inputs longer than the batch size are sent in several
                  requests and the results are concatenated.
        '''
        if isinstance(geneids, str_types):
            geneids = geneids.split(',')
        if not (isinstance(geneids, (list, tuple)) and len(geneids) > 0):
            raise ValueError('input "geneids" must be non-empty list or tuple.')
        if fields:
            kwargs['fields'] = self._format_list(fields)
        if 'filter' in kwargs:
            kwargs['fields'] = self._format_list(kwargs['filter'])
        verbose = kwargs.pop('verbose', True)
        as_dataframe = kwargs.pop('as_dataframe', False)
        # Always consume df_index: it is a client-side option and must not be
        # forwarded to the server as a query parameter.
        df_index = kwargs.pop('df_index', True)
        # return_raw is read (not popped) because _post consumes it later.
        return_raw = kwargs.get('return_raw', False)
        if return_raw:
            as_dataframe = False

        def query_fn(id_batch):
            return self._getvariants_inner(id_batch, **kwargs)

        out = []
        for hits in self._repeated_query(query_fn, geneids, verbose=verbose):
            if return_raw:
                out.append(hits)   # hits is the raw response text
            else:
                out.extend(hits)
        if return_raw and len(out) == 1:
            out = out[0]
        if as_dataframe:
            out = self._as_dataframe(out, df_index)
        return out

    def query_variant(self, q, **kwargs):
        '''Return the query result.

        This is a wrapper for GET query of "/query?q=<query>" service.

        NOTE(review): the optional parameters listed in the inherited
        MyGene.info docs (``fields``, ``species``, ``size``, ``skip``,
        ``sort``, ``entrezonly``, ``email``) are passed through unchanged --
        confirm which ones this service honours.

        :param q: a query string.
        :param as_dataframe: if True, return a DataFrame built from the hits
            (requires pandas); it is not indexed by 'query'.
        :param kwargs: any other keyword arguments are sent as query parameters.
        :return: the parsed JSON response or a pandas DataFrame.

        Example:

        >>> mv.query_variant('cdk2')
        >>> mv.query_variant('symbol:cdk*', size=5, as_dataframe=True)
        '''
        as_dataframe = kwargs.pop('as_dataframe', False)
        kwargs.update({'q': q})
        _url = self.url + '/query'
        out = self._get(_url, kwargs)
        if as_dataframe:
            out = self._as_dataframe(out, False)
        return out

    def _querymany_inner(self, qterms, **kwargs):
        '''POST one batch of query terms to ``/query`` and return the response.'''
        _kwargs = {'q': self._format_list(qterms)}
        _kwargs.update(kwargs)
        _url = self.url + '/query'
        return self._post(_url, _kwargs)

    def querymany(self, qterms, scopes=None, **kwargs):
        '''Return the batch query result.

        This is a wrapper for batched POST queries to the "/query" service.

        :param qterms: a list of query terms, or a string of comma-separated query terms.
        :param scopes: a list or comma-separated string naming the fields the
            terms are matched against.
        :param scope: back-compatible alias for **scopes**; takes precedence
            when given.
        :param fields: fields to return, a list or a comma-separated string.
        :param species: a list or comma-separated string (passed through;
            NOTE(review): inherited from MyGene.info -- confirm it applies).
        :param returnall: if True, return a dict with keys ``'out'``,
            ``'dup'`` and ``'missing'``.
        :param verbose: if True (default), print progress and a summary of
            duplicate and missing query terms.
        :param return_raw: if true, return raw response text (one string for
            a single batch, otherwise a list); disables as_dataframe.
        :param as_dataframe: if True, return a DataFrame (requires pandas).
        :param df_index: if True (default), index returned DataFrame by 'query'.
            Only applicable if as_dataframe=True.
        :return: a list of hits, raw text, a pandas DataFrame, or the
            ``returnall`` dict described above.
        :raises ValueError: if *qterms* is empty or not a list/tuple/string.

        Example:

        >>> mv.querymany(['DDX26B', 'CCDC83'], scopes='symbol')
        >>> mv.querymany(terms, scopes='symbol', fields='symbol', as_dataframe=True)

        .. Hint:: Passing a list longer than the batch size is fine; it is
                  sent in several requests.
        '''
        if isinstance(qterms, str_types):
            qterms = qterms.split(',')
        if not (isinstance(qterms, (list, tuple)) and len(qterms) > 0):
            raise ValueError('input "qterms" must be non-empty list or tuple.')

        if scopes:
            kwargs['scopes'] = self._format_list(scopes)
        if 'scope' in kwargs:
            # allow scope for back-compatibility
            kwargs['scopes'] = self._format_list(kwargs['scope'])
        if 'fields' in kwargs:
            kwargs['fields'] = self._format_list(kwargs['fields'])
        if 'species' in kwargs:
            kwargs['species'] = self._format_list(kwargs['species'])
        returnall = kwargs.pop('returnall', False)
        verbose = kwargs.pop('verbose', True)
        as_dataframe = kwargs.pop('as_dataframe', False)
        # Always consume df_index: it is a client-side option and must not be
        # forwarded to the server as a query parameter.
        df_index = kwargs.pop('df_index', True)
        # return_raw is read (not popped) because _post consumes it later.
        return_raw = kwargs.get('return_raw', False)
        if return_raw:
            as_dataframe = False

        def query_fn(term_batch):
            return self._querymany_inner(term_batch, **kwargs)

        out = []
        li_missing = []
        li_dup = []
        li_query = []
        for hits in self._repeated_query(query_fn, qterms, verbose=verbose):
            if return_raw:
                out.append(hits)   # hits is the raw response text
                continue
            out.extend(hits)
            for hit in hits:
                if hit.get('notfound', False):
                    li_missing.append(hit['query'])
                else:
                    li_query.append(hit['query'])

        if verbose:
            print("Finished.")
        if return_raw:
            if len(out) == 1:
                out = out[0]
            return out
        if as_dataframe:
            out = self._as_dataframe(out, df_index)

        # check dup hits: a query term that produced more than one hit
        if li_query:
            li_dup = [(query, cnt) for query, cnt in list_itemcnt(li_query) if cnt > 1]

        if verbose:
            if li_dup:
                print("{0} input query terms found dup hits:".format(len(li_dup)))
                print("\t"+str(li_dup)[:100])
            if li_missing:
                print("{0} input query terms found no hit:".format(len(li_missing)))
                print("\t"+str(li_missing)[:100])
        if returnall:
            return {'out': out, 'dup': li_dup, 'missing': li_missing}
        if verbose and (li_dup or li_missing):
            print('Pass "returnall=True" to return complete lists of duplicate or missing query terms.')
        return out

    def findgenes(self, id_li, **kwargs):
        '''.. deprecated:: 2.0.0

        Use :py:meth:`querymany` instead. It's kept here as an alias of
        :py:meth:`querymany` method; a ``DeprecationWarning`` is issued on
        every call.
        '''
        import warnings
        warnings.warn('Deprecated! Currently an alias of "querymany" method. Use "querymany" method directly.', DeprecationWarning)
        return self.querymany(id_li, **kwargs)