forked from Lammlab/Resic
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenome_3nt.py
executable file
·349 lines (296 loc) · 16.3 KB
/
genome_3nt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
"""
Functions to sort/combine nucleotide pairs/combine lines of multiline formats, and to revert combined formats to regular form
Usage:
genome_3nt.py FASTA_FILENAME FASTA_OUTPUT FASTQ_FILENAME FASTQ_OUTPUT
Options:
-FASTA_FILENAME fasta filename
"""
from numba.roc.hsaimpl import lower
from Utility.generators_utilities import class_generator
from Utility.parallel_generator import parallel_generator
import subprocess
from Utility.Fastq_class import Fastq
from Utility.Samfile_class import SamLine
import re
from Processing import sam_sorting
from docopt import docopt
import itertools
from Utility.multiline_sort import multiline_sort
import logging
import os
import shutil
import sys
def pre(fasta_file,fastq_file,out_fasta_file,out_fastq_file, nt_replacement):
if type(nt_replacement) is list:
nt_replacement = "".join(nt_replacement)
if fasta_file == out_fasta_file:
fasta_file_temp = fasta_file + "_temp_copy.fasta"
shutil.copyfile(fasta_file, fasta_file_temp)
fasta_nt = fasta_to_3nt(nt_replacement, fasta_file_temp,output_file=out_fasta_file)
os.remove(fasta_file_temp)
else:
fasta_nt = fasta_to_3nt(nt_replacement, fasta_file,output_file=out_fasta_file)
#logging.info(f"==exec==\nmultiline_to_3nt({2},{'~'}, {nt_replacement}, {fasta_file},output_file={out_fasta_file})")
#logging.info(f"==exec==\nmultiline_to_3nt({2},chr(127), {nt_replacement}, {fastq_file},output_file={out_fastq_file})")
if fastq_file is not None:
fastq_nt = multiline_to_3nt(4, chr(127), 2, nt_replacement, fastq_file , output_file=out_fastq_file)
else:
fastq_nt = None
return fastq_nt, fasta_nt
def post(fastq_file, sam_file, out_fastq_file, out_sam_file, **kwargs):
#logging.info(f"==exec==\nrevert_fastq_to_4nt({fastq_file}, kwargs['fastq_lib'],output_filename=out_fastq_file)multiline_to_3nt({2},chr(127), {fastq_file},output_file={out_fastq_file})")
if fastq_file is not None:
fastq_ntr = revert_fastq_to_4nt(fastq_file, kwargs['fastq_lib'],output_filename=out_fastq_file)
else:
fastq_ntr = None
#logging.info(f"==exec==\nmultiline_to_3nt({2},chr(127), {fastq_file},output_file={out_fastq_file})")
sam_ntr = revert_sam_to_4nt(sam_file, output_filename=out_sam_file,fastq_lib=kwargs['fastq_lib'])
return fastq_ntr, sam_ntr
# nt_pairings = {'AG': 'A', 'GA': 'A', 'CTU': 'U', 'AC': 'C', 'CA': 'C', 'CG': 'G', 'GC': 'G'}
nt_pairings = {"{x}{y}".format(x=x, y=y): min(x, y) for x, y in itertools.product("AGCT", repeat=2)}
def fasta_to_3nt(nt_replacement, filename, output_file=None):
'''
alternative function to multiline_to_3nt for fasta files with multiple lines for the sequence, instead of a single
line with the entire sequence
:param nt_replacement: nucleotides to combine (ex: 'AG', 'CT')
:param filename: filename for the input
:param output_file: filename for the output, if desired
:return: output filename
'''
if output_file is None:
splited = filename.split(".")
output_file = ".".join(splited[:-1])+"_nt."+splited[-1]
with open(filename, 'r'), open(output_file, 'w'):
# TODO: does case matter?
#if type(nt_replacement) is list:
#nt_replacement = "".join(nt_replacement)
#lower_case_nt = nt_replacement.lower()
lower_case_replacement = nt_pairings[nt_replacement].lower()
#print(lower_case_replacement)
command = f"cat {filename} | sed -r '/^[ \t]*$/d' | sed '/^[NAGCTnagct]/s/[{nt_replacement}]/{nt_pairings[nt_replacement]}/g' "\
f" | sed '/^[NAGCTnagct]/s/{[nt_replacement.lower()]}/{lower_case_replacement}/g' > {output_file} "
#command = f"cat {filename} | sed -r '/^[ \t]*$/d' | sed '/[AGCTagct]*/s/[{nt_replacement}]/{nt_pairings[nt_replacement]}/ig' > {output_file} "
#^ also works
# print(command)
try:
subprocess.run(command, shell=True)
except Exception as e:
raise e
return output_file
def multiline_to_3nt(number_of_lines, new_delim, sequence_line, nt_replacement, multi_filename=None, output_file=None):
'''
:param number_of_lines: number of lines in each entry
:param new_delim: any character that is not within the alphabet of possible characters that are used in the format
:param sequence_line: the number of the line containing the sequence to be modified
:param nt_replacement: the nucleotides to be combined
:param multi_filename: name of the file containing the data
:param output_file: optional filename to write output to. if not specified output goes to input filename with .nt extension
:return:
'''
if output_file is None:
splited = multi_filename.split(".")
output_file = ".".join(splited[:-1])+"_nt."+splited[-1]
with open(multi_filename, 'r'), open(output_file, 'w'):
# TODO: does case matter?
if type(nt_replacement) is list:
nt_replacement = "".join(nt_replacement)
#command = f"cat {multi_filename} | sed -r '/^[ \t]*$/d' |paste -d \"{new_delim}\" {' -'*int(number_of_lines)} \
#| awk -F{new_delim} 'BEGIN {{OFS = \"{new_delim}\"}}; {{gsub(/[{nt_replacement}]/,\"{nt_pairings[nt_replacement]}\",${sequence_line})}};{{print}};'\
#| sed 's/{1}/\\n/g' > {output_file} "
#| awk -F{new_delim} 'BEGIN {{OFS = \"{new_delim}\"}}; {{gsub(/[tolower({nt_replacement})]/,tolower(\"{nt_pairings[nt_replacement]}\"),${sequence_line})}};{{print}}' \
#OLD command with number variable replacement rather than format string replacement.
command = "cat {0} | sed -r '/^[ \t]*$/d' |paste -d \"{1}\" {2}\
| awk -F{1} 'BEGIN {{OFS = \"{1}\"}}; {{${5} = toupper(${5}); gsub(/[{3}]/,\"{4}\",${5})}};{{print}}' |" \
"sed 's/{1}/\\n/g' > {6} ".format(multi_filename, new_delim, ' -'*int(number_of_lines), nt_replacement, nt_pairings[nt_replacement], sequence_line, output_file)
# print(command)
#bash command for fastqs, AT/at to A/a
#"cat smallseq.fastq | sed -r '/^[ \t]*$/d' |paste -d "~" - - - -| awk -F~ 'BEGIN {OFS="~"};{gsub(/[AT]/,"A",$2);gsub(/[at]/,"a",$2)};{print}' | sed 's/~/\n/g' >smallseq.nt.fastq"
try:
subprocess.run(command, shell=True)
except Exception as e:
raise e
return output_file
def revert_fastq_to_4nt(fastq_3nt_filename, fastq_4nt_filename, output_filename=None):
'''
:param fastq_3nt_filename: negative reads from alignment of 3nt fastq library
:param fastq_4nt_filename: original 4nt fastq library
:param output_filename:
:return:
'''
sorted_fastq_3nt = multiline_sort(4,'~',1, fastq_3nt_filename)
sorted_fastq_4nt = multiline_sort(4,'~',1, fastq_4nt_filename)
if output_filename is None:
splited = fastq_3nt_filename.split(".")
output_filename = ".".join(splited[:-1]) + "_ntR." + splited[-1]
with open(sorted_fastq_3nt, 'r') as fastq_3nt, open(sorted_fastq_4nt, 'r') as fastq_4nt:
gen_3nt = class_generator(Fastq, file = fastq_3nt, number_of_lines=4)
gen_4nt = class_generator(Fastq, file = fastq_4nt, number_of_lines=4)
get_fastq_id = lambda fq: fq.strings[0]
with open(output_filename, 'w') as out:
p_gen = parallel_generator([gen_3nt, gen_4nt], [get_fastq_id, get_fastq_id])
for [fq_3, fq_4] in p_gen:
if (fq_3 is not None) and (fq_4 is not None):
fq_3[0].strings[1] = fq_4[0].strings[1]
out.write(re.sub('\n\n', '\n',str(fq_3[0])))
return output_filename
def reverse_multiline(number_of_lines, new_delim, sequence_line, multi_filename, output_file=None):
'''
:param number_of_lines: number of lines in each entry
:param new_delim: any character that is not within the alphabet of possible characters that are used in the format
:param sequence_line: the number of the line containing the sequence to be modified
:param nt_replacement: the nucleotides to be combined
:param multi_filename: name of the file containing the data
:param output_file: optional filename to write output to. if not specified output goes to input filename with .nt extension
:return:
'''
if output_file is None:
splited = multi_filename.split(".")
output_file = ".".join(splited[:-1])+"_nt."+splited[-1]
with open(multi_filename, 'r'), open(output_file, 'w'):
#OLD command with number variable replacement rather than format string replacement.
command = "cat {0} | sed -r '/^[ \t]*$/d' |paste -d \"{1}\" {2}\
| awk -F{1} 'BEGIN {{OFS = \"{1}\"}};{{cmd=\"echo \"${3}\" | rev\"; cmd | getline reversed_line ; close(cmd) ;gsub(${3},reversed_line,${3})}};{{print}}' |" \
"sed 's/{1}/\\n/g' > {4} ".format(multi_filename, new_delim, ' -'*int(number_of_lines), sequence_line, output_file)
# print(command)
#bash command for fastqs, AT/at to A/a
#"cat smallseq.fastq | sed -r '/^[ \t]*$/d' |paste -d "~" - - - -| awk -F~ 'BEGIN {OFS="~"};{gsub(/[AT]/,"A",$2);gsub(/[at]/,"a",$2)};{print}' | sed 's/~/\n/g' >smallseq.nt.fastq"
try:
subprocess.run(command, shell=True)
except Exception as e:
raise e
return output_file
def complement_multiline(number_of_lines, new_delim, sequence_line, multi_filename, output_filename=None):
'''
Complement the sequence line of a multilined format file. Nucleotides are switched to their appropriate base pair (A->T, C->G, etc)
:param number_of_lines: number of lines in each entry
:param new_delim: any character that is not within the alphabet of possible characters that are used in the format
:param sequence_line: the number of the line containing the sequence to be modified
:param nt_replacement: the nucleotides to be combined
:param multi_filename: name of the file containing the data
:param output_file: optional filename to write output to. if not specified output goes to input filename with .nt extension
:return:
'''
#TODO: only works on fasta now
if output_filename is None:
splited = multi_filename.split(".")
output_filename = ".".join(splited[:-1])+"_complement."+splited[-1]
with open(multi_filename, 'r') as fasta_file, open(output_filename, 'w') as out:
translation = str.maketrans('ACGTNacgtn', 'TGCANtgcan')
#TODO: use "tr AGCTagct TCGAtcga" command to flip the nucleotides
fasta_gen = class_generator(str, file = fasta_file)
for line in fasta_gen:
if line[0] != '>':
line = line.translate(translation)
out.write(line.strip("\n") + "\n")
return output_filename
def reverse_complement_sequence(sequence):
'''
reverse complement of sequence of nucleotides
:return: reversed sequence
'''
reverse_dict = {'a':'t', 't':'a', 'g':'c', 'c':'g', 'A':'T', 'T':'A','G':'C' ,'C':'G', 'N':'N', 'n':'n'}
reversed_sequence = ''
for nt in reversed(sequence):
reversed_sequence += reverse_dict[nt]
return reversed_sequence
def revert_samline_to_4nt(samline, fastq_entry):
'''
replace the sequence field of a SAM file line with the sequence from a fastq entry
:param samline: samline to revert. Of type SamLine, from Utility.Samfile_class
:param fastq_entry: fastq entry to use as reference
:return: the updated SamLine object
'''
fastq_sequence = fastq_entry.get_seq().strip("\n")
if samline.flags == 16:
fastq_sequence = reverse_complement_sequence(fastq_sequence)
samline.sequence = fastq_sequence
return samline
def flip_sam_flags(samfile, flag_to_replace, new_flag, output_filename=None):
if output_filename is None:
splited = samfile.split(".")
output_filename = ".".join(splited[:-1]) + "_flipped." + splited[-1]
#create temp file if input and output filenames are the same
if samfile == output_filename:
temp_name=samfile+"temp"
shutil.copyfile(samfile, temp_name)
samfile = temp_name
with open(samfile, 'r') as sam_file, open(output_filename, 'w') as out:
header_gen = class_generator(str, skip_condition = (lambda line: line[0]!='@'), file=sam_file)
for line in header_gen:
out.write(str(line))
with open(samfile, 'r') as sam_file, open(output_filename, 'a') as out:
sam_gen = class_generator(SamLine, skip_condition = (lambda line: line[0]=='@'), file = sam_file)
for line in sam_gen:
if line.flags == flag_to_replace:
line.flags = new_flag#samclass turns flag to string using str(new_flag), turning(for example) 16->20
out.write(str(line) + '\n')
if samfile == output_filename:
os.remove(temp_name)
return output_filename
def sort_sam_alphabetic(samfile):
splited = samfile.split(".")
sorted_name = ".".join(splited[:-1]) + "_sorted." + splited[-1]
command = "cat {0} | sed -r '/^[ \t]*$/d' | LC_ALL=C sort -k1,1n > {1}".format(samfile,sorted_name)
try:
subprocess.run(command, shell = True)
except Exception as e:
raise(e)
return sorted_name
def revert_sam_to_4nt(aligned_3nt_sam_filename, output_filename=None, sorted=False, **kwargs):
'''
:param aligned_3nt_sam_filename: filename of sam format file, which should contain sam lines that correspond to
a library of reads in fastq format to revert the sam file according to.
:param output_filename: filename to write output to
:param kwargs: fastq_lib: the original library of sequence reads that was used to align the sam file
:return: None, in case of error
'''
if 'fastq_lib' not in kwargs:
print("no reference library provided")
return None
if output_filename is None:
splited = aligned_3nt_sam_filename.split(".")
output_filename = ".".join(splited[:-1]) + "_ntR." + splited[-1]
sequence_library_filename = kwargs['fastq_lib']
if not sorted:
#sort library of fastq reads according to sequence id
sorted_lib_filename = multiline_sort(4, "~", 1, sequence_library_filename)
#sort sam file according to query sequence id
sorted_sam_filename = sort_sam_alphabetic(aligned_3nt_sam_filename)
else:
sorted_lib_filename = sequence_library_filename
sorted_sam_filename = aligned_3nt_sam_filename
#create temp file to avoid overwriting input with output if same input and output filenames
if output_filename == aligned_3nt_sam_filename:
temp_name=sorted_sam_filename+"temp_copy"
shutil.copyfile(sorted_sam_filename, temp_name)
sorted_sam_filename = temp_name
#add the header lines
with open(sorted_sam_filename, 'r') as sam_file, open(output_filename, 'w') as out:
header_gen = class_generator(str, skip_condition = (lambda line: line[0]!='@'), file=sam_file)
for line in header_gen:
out.write(str(line))
#add the reverted sam lines
with open(sorted_sam_filename, 'r') as sam_file, open(sorted_lib_filename, 'r') as lib_file, open(output_filename, 'a') as out:
aligned_3nt_gen = class_generator(SamLine, skip_condition = (lambda line: line[0]=='@'), file = sam_file)
seq_library_gen = class_generator(Fastq, file = lib_file, number_of_lines=4)
#TODO: update functors when fastq_class/samfile_class are updated
get_sam_qname = lambda x: x.seq_id.split(" ")[0]
get_fastq_id = lambda x: x.strings[0][1:].split(" ")[0]
p_gen = parallel_generator([aligned_3nt_gen, seq_library_gen], [get_sam_qname, get_fastq_id])
for [samlines, fastq_entry] in p_gen:
#print("samlines: ",samlines,"fastq_entry: ",fastq_entry)
if (samlines is not None and fastq_entry is not None):
for samline in samlines:
reverted_samline = revert_samline_to_4nt(samline, fastq_entry[0])
out.write(str(reverted_samline) + "\n")
if output_filename == aligned_3nt_sam_filename:
os.remove(temp_name)
return output_filename
#ignore main, not for use
if __name__ == '__main__':
arguments = docopt(__doc__)
try:
multiline_to_3nt(4,'~',2,'AG',arguments['FASTQ_FILENAME'],arguments['FASTQ_OUTPUT'])
multiline_to_3nt(2,'~',2,'AG',arguments['FASTA_FILENAME'], arguments['FASTA_OUTPUT'])
except Exception as e:
print("error")