-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathP6_test_exam.py
170 lines (154 loc) · 5.4 KB
/
P6_test_exam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python3
"""
Author: Anna-Marie Seelen
Studentnumber:1008970
Description: alignment of protein sequences to a reference protein sequence
and returning stats
Usage: python3
"""
# import statement
from sys import argv
import re
import subprocess
import os.path
# functions
def parse_input(fileref, filerelated):
"""Return protein sequences from two input files
fileref: str, name of input file with the reference protein sequence
in fasta format
filerelated: str, name of input file with the protein sequences to compare
to the reference protein sequence in fasta format
return: dict, with name of sequence as key and protein sequence as value
"""
lines=(open(fileref))
dict={}
for line in lines:
line=line.strip()
if line.startswith(">"):
line=line.replace(">", "")
occur = re.search(r'Guanine', line)
index=occur.start()
line=line[0:index-1]
dict[line]=""
else:
key=list(dict)[-1]
dict[key]+=line
lines=(open(filerelated))
for line in lines:
line=line.strip()
if line.startswith(">"):
line=line.replace(">", "")
occur = re.search(r'Guanine', line)
index=occur.start()
line=line[0:index-1]
dict[line]=""
else:
key=list(dict)[-1]
dict[key]+=line
return dict
def lenght_seq(dict):
"""Calculates the lenght of a protein seq
dict: dict, with name of sequence as key and protein sequence as value
return: dict, with name of sequence as key and sequence
lenght as value
"""
dict_with_lenght={}
for key in dict:
lenght_seq=len(dict[key])
dict_with_lenght[key]=lenght_seq
return dict_with_lenght
def write_files(dict):
#for i in range(1,len(dict))
file_ref=open("ref.fasta", "w")
file_ref.write(">{}\n".format(list(dict.keys())[0]))
file_ref.write(dict[list(dict.keys())[0]])
file_rel=open("rel.fasta", "w")
file_rel.write(">{}\n".format(list(dict.keys())[1]))
file_rel.write(dict[list(dict.keys())[1]])
file_ref.close()
file_rel.close()
return (file_ref, file_rel)
#these new files didn work when in inputted them on the comand line for
#the needle programm so I ended up using P6ref.fasta as the files.
def run_needle():
out_fn = "out.needle"
cmd = 'needle {} {}'\
.format("P6ref.fasta", "P6ref.fasta")
#I couldn't give all arguments because needle would
# give an error 8.0, 0.5 and out.needle have to be typed into the command
#line.
res = subprocess.check_output(cmd, shell=True)
return res
#out=needle(ref, rel, 8.0, 0.5, out.needle)
#print(out.needle)
def extract_alignments(out_fn):
lines=(open(out_fn))
clean_lines=[]
for line in lines:
line=line.strip()
if line.startswith("GPA1"):
line=''.join(filter(lambda ch: not ch.isdigit(), line))
line = line.replace(" ", "")
line = line.replace("GPA_ARATH", "")
clean_lines.append(line)
else:
pass
#for an even line join them, for an odd line join them
ref_lines = []
pro_lines = []
for i,line in enumerate(clean_lines):
if i%2==0:
pro_lines.append(line)
else:
ref_lines.append(line)
string_ref=""
string_rel=""
for i in ref_lines:
string_ref+=i
for i in pro_lines:
string_rel+=i
list_of_string=[]
list_of_string.append(string_ref)
list_of_string.append(string_rel)
return list_of_string
def hamming_distance(list_of_string):
"""Return the number of corresponding nucleotides that differ in two DNA sequences.
DNA_seq: list of strings with two DNA sequences
return: int, count of different corresponding nucleotides
"""
count=0
for i in range(len(list_of_string[1])):
if list_of_string[0][i] != list_of_string[1][i]:
count+=1
else:
pass
return count
def to_output_tab_file(lenght_dict, h_distance):
"""Takes a nested sorted list and outputs a tab delimited file with the contents of each sub list on a line
sorted_list: nested list with [[accession_number, organism name, GC_content, length], etc.] sorted
in ascending GC content order
return: tab delimited text file with the contents of each sub list on a line
"""
tab_file=open("P6_out_tab_delimited.txt", "w")
for key in lenght_dict:
tab_file.write("{0}\t{1}\t{2}".format(key, lenght_dict[key], h_distance))
tab_file.write("\n")
return None
def main():
"""Main function of this module"""
# step 1: parse the protein sequence from file 1 and file 2 into dict
dict=parse_input(argv[1], argv[2])
# step 2: determine the lenght of the sequences
lenght_dict=lenght_seq(dict)
# step 3: make the files for needle
file_ref, file_rel=write_files(dict)
# step 4: align protein sequences from file 1 to the other species in file 2
run_needle()
# step 5: parse needle output to extract pairwise alignments
list_of_string=extract_alignments(argv[3])
# step 5: calculate hamming distance between pairwise alignments
h_distance=hamming_distance(list_of_string)
# step 7: tab delimited file for with the alignments, containing: sequence
to_output_tab_file(lenght_dict, h_distance)
if __name__=="__main__":
main()