-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathP3_script.py
136 lines (121 loc) · 5.23 KB
/
P3_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
"""
Author: Anna-Marie Seelen
Studentnr: 1008970
Description: Script that parses an argonaut-formatted GenBank file and outputs (1) a tab-delimited file with
information such as GC content and (2) a file in FASTA format.
"""
from sys import argv
import subprocess
import os.path
import re
import string
def parse_input(filename):
    """Parse a GenBank-style file into a nested dictionary.

    For every record the ACCESSION and ORGANISM header lines are read, plus
    the sequence lines found between the ORIGIN line and the closing "//".

    filename: str, name of argonaut formatted input file
    return: nested dictionary with {accession_number:{organism_name:DNA-seq}}.
        The sequence keeps the letter case used in the file; whitespace and
        position numbers are removed. A record that has no ORGANISM line maps
        to an empty dictionary.
    raises: ValueError if an ORGANISM line appears before any ACCESSION line,
        or sequence data appears before the record's ORGANISM line.
    """
    gb_dict = {}
    accession = None   # accession number of the record being read
    fragments = None   # list collecting the sequence lines of that record
    in_origin = False  # True while between an ORIGIN line and "//"

    # The context manager guarantees the file is closed, also on errors.
    with open(filename) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if line.startswith("ACCESSION"):
                # Slice the keyword off and strip, so the amount of padding
                # between keyword and value does not end up in the key.
                accession = line[len("ACCESSION"):].strip()
                # A repeated accession starts over, replacing the old entry.
                gb_dict[accession] = {}
                fragments = None
                in_origin = False
            elif line.startswith("ORGANISM"):
                if accession is None:
                    raise ValueError(
                        "ORGANISM line found before any ACCESSION line")
                organism_name = line[len("ORGANISM"):].strip()
                fragments = []
                # Track the record explicitly instead of relying on "the last
                # key of the dict", which is wrong for repeated accessions.
                gb_dict[accession] = {organism_name: fragments}
            elif line.startswith("ORIGIN"):
                in_origin = True
            elif line.startswith("//"):
                in_origin = False
            elif in_origin:
                if fragments is None:
                    raise ValueError(
                        "sequence data found before an ORGANISM line")
                # Drop the position numbers and the blanks between blocks.
                fragments.append("".join(
                    ch for ch in line
                    if not (ch.isdigit() or ch.isspace())))

    # Join each record once; this avoids quadratic string concatenation.
    for record in gb_dict.values():
        for organism_name, parts in record.items():
            record[organism_name] = "".join(parts)
    return gb_dict
def cal_GC_content(gb_dict):
    """Calculate the GC content and length of every DNA sequence.

    gb_dict: nested dictionary with {accession_number:{organism_name:DNA-seq}}
    return: nested list with [[accession_number, organism name, GC_content,
        length], etc.], one sub list per sequence. GC_content is a percentage
        (float); G and C are counted regardless of letter case. An empty
        sequence gets a GC_content of 0.0 instead of a division by zero.
    """
    long_list = []
    for accession, organisms in gb_dict.items():
        # An accession without organism/sequence simply contributes no row.
        for organism_name in organisms:
            dna_seq = organisms[organism_name]
            seq_length = len(dna_seq)
            lowered = dna_seq.lower()
            gc_count = lowered.count("g") + lowered.count("c")
            if seq_length:
                gc_content = (gc_count / seq_length) * 100
            else:
                gc_content = 0.0
            long_list.append(
                [accession, organism_name, gc_content, seq_length])
    return long_list
def sort_nested_list(long_list):
    """Order the sub lists of a nested list by their GC content.

    long_list: nested list with [[accession_number, organism name, GC_content, length], etc.]
    return: the same list object, sorted in place on the third element of each
        sub list (GC_content) from highest to lowest; sub lists with an equal
        GC_content keep their original relative order
    """
    def gc_content_of(entry):
        # Position 2 of every sub list holds the GC content.
        return entry[2]

    long_list.sort(reverse=True, key=gc_content_of)
    return long_list
def to_output_tab_file(sorted_list):
    """Write a nested list to the tab delimited file P3_out_tab_delimited.txt.

    sorted_list: nested list with [[accession_number, organism name, GC_content, length], etc.]
    return: None; as a side effect the file "P3_out_tab_delimited.txt" is
        (over)written in the current working directory with one sub list per
        line, in the order given, and the GC content rounded to two decimals
    """
    # The context manager flushes and closes the file, also on errors.
    with open("P3_out_tab_delimited.txt", "w") as tab_file:
        for organism in sorted_list:
            tab_file.write("{0}\t{1}\t{2:.2f}\t{3}\n".format(*organism))
    return None
def to_output_fasta_file(gb_dict, sorted_list):
    """Write the sequences to the fasta formatted file P3_out_fasta.txt.

    gb_dict: nested dictionary with {accession_number:{organism_name:DNA-seq}}
    sorted_list: nested list with [[accession_number, organism name, GC_content, length], etc.]
    return: None; as a side effect the file "P3_out_fasta.txt" is (over)written
        in the current working directory. Records follow the order of
        sorted_list; each has a ">accession organism" header line followed by
        the upper-cased sequence on a single line
    """
    # The context manager flushes and closes the file, also on errors.
    with open("P3_out_fasta.txt", "w") as fasta_file:
        for item in sorted_list:
            acc_num = item[0]
            # The organism name is the (last) key stored for this accession.
            organism_name = list(gb_dict[acc_num])[-1]
            DNA_seq = gb_dict[acc_num][organism_name].upper()
            fasta_file.write(
                ">{0} {1}\n{2}\n".format(acc_num, organism_name, DNA_seq))
    return None
def main():
    """Run the pipeline: parse the input file, compute GC content, sort, write both output files.

    The input file name is taken from the first command line argument. When it
    is missing the script exits with a usage message instead of an IndexError.
    """
    if len(argv) < 2:
        script_name = argv[0] if argv else "P3_script.py"
        raise SystemExit("Usage: {0} <genbank_file>".format(script_name))
    # step 1: parse the file and put accession num, organism name and dna sequence in a nested dict
    gb_dict = parse_input(argv[1])
    # step 2: calculate the GC content and return a nested list with accession num, organism name,
    # gc content and length
    long_list = cal_GC_content(gb_dict)
    # step 3: sort the nested list on GC content (sort_nested_list sorts with reverse=True,
    # i.e. highest GC content first)
    sorted_list = sort_nested_list(long_list)
    # step 4: output the nested list to a tab delimited file
    to_output_tab_file(sorted_list)
    # step 5: output the sequences in the same order to a fasta file
    to_output_fasta_file(gb_dict, sorted_list)
# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()