-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgenbank2Fasta.py
executable file
·148 lines (126 loc) · 4.71 KB
/
genbank2Fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python3
"""
This script takes a GenBank file with and converts it to a standard fasta file.
The genbank format is structured like so:
LOCUS defline length
DEFINITION defline length
TITLE defline
ORIGIN
position seq beginning
...
position seq end
//
This is done for each fasta entry. The sequence is also broken up into segments
of 10 bases delimited by spaces. I will set "DEFINITION" as the default field for
capturing the defline while allowing the user to specify a different source.
Generally the order of sequences in the input is maintained in the output. When
defline duplicates are found they are appended to the end of the fasta in no
intentional order and a warning is printed to stderr.
To use this script simply type "python genbank2Fasta.py fileName.gb" or
"genbank2Fasta.py fileName.gb" if genbank2Fasta.py is in your PATH
Created by David E. Hufnagel on Feb 7, 2024
last update: April 26, 2024
"""
import sys, os
# Add items into a dictionary. When keys are duplicated append to a list in the value
def SaveIntoDict(key, val, dictx):
if key in dictx:
dictx[key].append(val)
else:
dictx[key] = [val]
# If an alternative defline source has been requested collect it in alt_def_source
alt_def_source = "DEFINITION"
if len(sys.argv) == 3:
alt_def_source = sys.argv[2]
if alt_def_source not in ["LOCUS", "DEFINITION", "TITLE"]:
print(
"ERROR: Incorrect defline source or too many arguments provided",
file=sys.stderr,
)
sys.exit()
# no arguments provided
elif len(sys.argv) < 2 or sys.argv[1] in [
"-h",
"--help",
"-help",
"--version",
]:
print(
"Too few arguments provided.\n\n\
Usage instructions for genbank2Fasta.py:\n\
python genbank2Fasta.py <input_file> [defline indicator, default is 'TITLE'] > <output_file>\n",
file=sys.stderr,
)
sys.exit()
# too many arguments provided
elif len(sys.argv) > 3:
print(
"Too many arguments provided.\n\n\
Usage instructions for genbank2Fasta.py:\n\
python genbank2Fasta.py <input_file> [defline indicator, default is 'TITLE'] > <output_file>\n",
file=sys.stderr,
)
sys.exit()
# Parse the input file and, for every entry, capture the defline using TITLE or
# a specified alternate source, also capture the sequence ignoring all other
# data and formatting; Save the result in a dict of key: defline val: seq.
# Also save all deflines in a list to maintain the original order.
inp = open(sys.argv[1])
def_list = []
seq_dict = {}
seq = ""
def_str = ""
is_seq = False
is_def = False
for line in inp:
if is_seq: # capture sequence data between "ORIGIN" and "//"
seq_line = "".join(line.strip().split()[1:]) # the sequence data from this line
seq += seq_line
if line.startswith("ORIGIN"): # turn on sequence capture at "ORIGIN"
is_seq = True
elif line.startswith("//"): # reset seq when at the end of the entry
SaveIntoDict(def_str, seq, seq_dict)
seq = ""
def_str = ""
is_seq = False
if line.startswith(alt_def_source): # capture defline line 1
is_def = True
# capture subsequent defline lines
elif not line.startswith(" ") and is_def == True:
is_def = False
def_list.append(def_str)
# capture defline starting with the defline indicator and running through the lines that don't start with fields
if is_def:
if line.startswith(" "):
def_str = def_str + "_" + "_".join(line.split())
else:
def_str = "_".join(line.split()[1:])
# Go through the defline list, grab sequences from the dictionary, and output
# the results in fasta format. Defline duplicates are placed at the end and
# come with a warning.
to_warn = False # a boolean for when to warn that deflines are duplicated
def_dups = []
for defline in def_list:
seqs = seq_dict[defline]
if (
len(seqs) > 1
): # When the defline is duplicated add a number to the defline and print a warning
to_warn = True
dup_num = 1
# iterate through each duplicate
for seq in seqs:
newlines = f">{defline}_dup{dup_num}\n{seq}\n"
def_dups.append(newlines)
dup_num += 1
else: # simple non-duplicated cases
seq = seqs[0]
newlines = f">{defline}\n{seq}\n"
print(newlines, end="")
# Print warning for duplicates and write them to the output after converting to a
# set so as to not duplicate the duplicates in the output.
def_dups = set(def_dups)
if to_warn:
print("WARNING: deflines are duplicated in the input file", file=sys.stderr)
for dup in def_dups:
print(dup, end="")
inp.close()