-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathindel_divergence.py
executable file
·82 lines (60 loc) · 2.86 KB
/
indel_divergence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
from __future__ import print_function
import argparse
import subprocess
from qsub import q_sub
import sys
def popen_grab(cmd):
output_lines_list = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).communicate()[0].split('\n')[:-1]
return output_lines_list
def main():
# arguments
parser = argparse.ArgumentParser()
parser.add_argument('-wga', help='Whole genome alignment bed file', required=True)
parser.add_argument('-bed', help='Coordinates to calc divergence for, bed format', required=True)
parser.add_argument('-chromo', help=argparse.SUPPRESS, default='all')
parser.add_argument('-out', help='Output path and file', required=True)
args = parser.parse_args()
if args.chromo == 'all':
chromo_list = popen_grab('zcat {} | cut -f 1 | uniq'.format(args.wga))
with open(args.out, 'w') as out_file:
print('chromo\tn_variants\tcallable\tdivergence\tindel_type', file=out_file)
for x in chromo_list:
cmd = ' '.join(sys.argv) + ' -chromo {}'.format(x)
outs = args.out.replace('.txt', '') + '_{}'.format(x)
q_sub([cmd], out=outs, evolgen=True)
sys.exit()
else:
callable_cmd = ('zgrep -w ^{} {} | bedtools intersect -a stdin -b {} | '
'~/WGAbed/wga_bed_summary.py -callable').format(
args.chromo, args.wga, args.bed)
print(callable_cmd, file=sys.stdout)
call_sites = popen_grab(callable_cmd)[0].split('\t')
n_sites = int(call_sites[1])
for var_type in ['indel', 'deletion', 'insertion']:
if var_type == 'indel':
# folded data
indel_cmd = ('zgrep -w ^{} {} | bedtools intersect -a stdin -b {} | '
'~/WGAbed/wga_bed_indels.py '
'-min_coverage 3 -max_length 50 '
'-ref_specific | wc -l').format(
args.chromo, args.wga, args.bed)
else:
# unfolded data
indel_cmd = ('zgrep -w ^{} {} | bedtools intersect -a stdin -b {} | '
'~/WGAbed/wga_bed_indels.py '
'-max_length 50 -min_coverage 3 '
'-ref_specific | '
'~/WGAbed/polarise_wga_ref_indels.py '
'-indel_type {} | wc -l').format(
args.chromo, args.wga, args.bed, var_type)
print(indel_cmd, file=sys.stdout)
n_indels = int(popen_grab(indel_cmd)[0])
if n_sites == 0:
div = 0.0
else:
div = float(n_indels)/float(n_sites)
with open(args.out, 'a') as out_file:
print(args.chromo, n_indels, n_sites, div, var_type, file=out_file, sep='\t')
if __name__ == '__main__':
main()