-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathPEsortedSAM2readprofile.py
executable file
·113 lines (105 loc) · 3.82 KB
/
PEsortedSAM2readprofile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python
import sys
import pkg_resources
pkg_resources.require( "bx-python" )
import bx.seq.twobit
import re
##output columns: read_name chr prefix_start prefix_end TR_start TR_end suffix_start suffix_end TR_length TR_sequence
def readlengthinfer(sequence):
total_readlength=0
countlist=['M','I','S','X','=']
not_countlist=['D','P','N','H']
matchs=re.findall(r"(\d+)([A-Z=])",sequence)
for match in matchs:
if match[1] in countlist:
total_readlength+=int(match[0])
return total_readlength
samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname
seq_path = sys.argv[2] #Path to the reference genome in 2bit format
##maxTRlength=int(sys.argv[4])
##maxoriginalreadlength=int(sys.argv[5])
maxTRlength=int(sys.argv[3])
maxoriginalreadlength=int(sys.argv[4])
outfile=sys.argv[5]
fout = open(outfile,'w')
twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) )
skipped=0
while True:
read = samf.readline().strip()
if not(read): #EOF reached
break
if read[0] == "@":
#print read
continue
mate = samf.readline().strip()
if not(mate): #EOF reached
break
read_elems = read.split()
mate_elems = mate.split()
read_name = read_elems[0].strip()
mate_name = mate_elems[0].strip()
while True:
if read_name == mate_name:
break
elif read_name != mate_name:
#print >>sys.stderr, "Input SAM file doesn't seem to be sorted by readname. Please sort and retry."
#break
skipped += 1
read = mate
read_elems = mate_elems
mate = samf.readline().strip()
read_name = read_elems[0].strip()
mate_name = mate_elems[0].strip()
if not(mate): #EOF reached
break
mate_elems = mate.split()
#extract XT:A tag
#for e in read_elems:
# if e.startswith('XT:A'):
# read_xt = e
#for e in mate_elems:
# if e.startswith('XT:A'):
# mate_xt = e
#if 'XT:A:U' not in read_elems or 'XT:A:U' not in mate_elems: #both read and it's mate need to be mapped uniquely
# continue
read_chr = read_elems[2]
read_start = int(read_elems[3])
read_cigar = read_elems[5]
# if len(read_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M
# continue
# read_len = int(read_cigar.split('M')[0])
read_len=readlengthinfer(read_cigar)
mate_chr = mate_elems[2]
mate_start = int(mate_elems[3])
mate_cigar = mate_elems[5]
# if len(mate_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M
# continue
# mate_len = int(mate_cigar.split('M')[0])
mate_len=readlengthinfer(mate_cigar)
if read_chr != mate_chr: # check that they were mapped to the same chromosome
continue
if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength):
continue
if read_start < mate_start:
pre_s = read_start-1
pre_e = read_start-1+read_len
tr_s = read_start-1+read_len
tr_e = mate_start-1
suf_s = mate_start-1
suf_e = mate_start-1+mate_len
else:
pre_s = mate_start-1
pre_e = mate_start-1+mate_len
tr_s = mate_start-1+mate_len
tr_e = read_start-1
suf_s = read_start-1
suf_e = read_start-1+read_len
tr_len = abs(tr_e - tr_s)
if tr_len > maxTRlength:
continue
if pre_e >= suf_s: #overlapping prefix and suffix
continue
tr_ref_seq = twobitfile[read_chr][tr_s:tr_e]
##print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq)
fout.writelines('\t'.join(map(str,[read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq]))+'\n')
print "Skipped %d unpaired reads" %(skipped)