forked from maximilianh/crisporWebsite
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmakeGenomeInfo
executable file
·60 lines (50 loc) · 1.86 KB
/
makeGenomeInfo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python
from collections import namedtuple
import os, logging
from os.path import join, isfile
# Go through all subdirectories of genomesDir, collect the rows of every
# genomeInfo.tab file found there and write them to genomeInfo.all.tab.
# Directory whose subdirectories hold the per-genome genomeInfo.tab files.
genomesDir = "genomes"
def lineFileNext(fh):
    """
    Parse a tab-separated file whose first line holds the field names.

    The header line may start with "#"; that prefix is removed before the
    line is split on tabs. Every following line is split on tabs and yielded
    as a collections.namedtuple ('tsvRec') with one attribute per header
    field. All values are left as strings, no type conversion is done.

    Lines whose number of fields does not match the header are reported via
    logging.error and skipped; they do not stop the iteration. An empty file
    yields nothing.

    fh: open file-like object supporting readline() and line iteration.
        Its .name attribute, if present, is only used in error messages.
    """
    headerLine = fh.readline()
    if not headerLine:
        # empty file: without a header there is nothing to build records from
        return

    # lstrip, not strip: only a leading "#" marks the header as a comment line.
    # "\r" is removed too so that CRLF files do not end up with an invalid
    # last field name.
    headers = headerLine.rstrip("\r\n").lstrip("#").split("\t")
    Record = namedtuple('tsvRec', headers)

    for line in fh:
        line = line.rstrip("\r\n")
        fields = line.split("\t")
        try:
            rec = Record(*fields)
        except TypeError as msg:
            # namedtuple raises TypeError on a wrong number of fields;
            # report the line and keep going instead of aborting the file
            logging.error("Exception occured while parsing line, %s", msg)
            logging.error("Filename %s", getattr(fh, "name", "<unknown>"))
            logging.error("Line was: %s", repr(line))
            logging.error("Does number of fields match headers?")
            logging.error("Headers are: %s", headers)
            continue
        yield rec
# Columns of the combined table, in output order. Columns missing from an
# individual genomeInfo.tab file are written as empty strings.
fields = ["name","description", "genome", "scientificName", "url", "server"]

# Write to a temporary file first and rename it at the end, so the final
# file is only replaced once all input files have been processed.
tmpFname = join(genomesDir, "genomeInfo.all.tab.tmp")
outFname = join(genomesDir, "genomeInfo.all.tab")

with open(tmpFname, "w") as ofh:
    ofh.write("\t".join(fields)+"\n")
    # os.listdir returns entries in arbitrary order; sort them so that the
    # output file is reproducible between runs
    for subDir in sorted(os.listdir(genomesDir)):
        infoFname = join(genomesDir, subDir, "genomeInfo.tab")
        if not isfile(infoFname):
            # plain files and subdirectories without a genomeInfo.tab
            continue
        # "with" closes every input file as soon as it has been read
        with open(infoFname) as ifh:
            for row in lineFileNext(ifh):
                rowDict = row._asdict()
                newRow = [rowDict.get(field, "") for field in fields]
                ofh.write("\t".join(newRow)+"\n")

os.rename(tmpFname, outFname)
print("Recreated %s" % outFname)