forked from Lammlab/Resic
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyze_editing_percent.py
executable file
·483 lines (389 loc) · 23.4 KB
/
analyze_editing_percent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
"""analyze_editing_percent
Usage:
analyze_editing_percent.py param
analyze_editing_percent.py example
analyze_editing_percent.py --reads_min=<thresh> --edit_min=<percent> --noise_max=<percent> [ --debug --tag=<tag_string> ] -s <summary_output_filename> <pileup_files>...
analyze_editing_percent.py --reads_min=<thresh> --edit_min=<percent> --noise_max=<percent> [ --debug --tag=<tag_string> --add_csv_headers --out_dir=<out> ] <summary_output_filename> <pileup_files>...
analyze_editing_percent.py -h | --help
Options:
-h --help Show this screen.
-r --reads_min=<thresh> Minimum ammount of reads requires to flag a site as an editing site
-e --edit_min=<percent> Minimum percent of reads required from the most frequent bp change to flag as an editing site
-n --noise_max=<percent> Maximum percent of other bp changes to flag as an editing site
-o --out_dir=<out> Directory for putting output pileup files relevant if changing the pileup files default value is dir of the summary file
-s --summarise_only Do not output new pileups. only produce the summary file Cannot be combined with --tag or --add_csv_headers
-t --tag=<tag_string> A tag to add to sites that where considered editing sites
-v --add_csv_headers Out put the pileups with a csv style header
-d --debug use for debug options - will not run the commands in parallel, but one by one
"""
from Utility.generators_utilities import getLineFromChunk, class_generator
from Utility.Pileup_class import Pileup_line
import itertools
import pandas as pd
import os
import re
from docopt import docopt
import sys
import subprocess
from Utility.logger_utils import get_Pipes_dir
from Experiments.forontiers_jupyter.parallel_commands import parallel_commands
def is_pileup_line_edited(pileup_line: Pileup_line, read_thresh_hold=1, editing_min_percent_threshold=0.,
editing_max_percent_threshold=100, noise_percent_threshold=100, const_tag='', editing_read_thresh=0):
"""
adds editing percentage and noise percentage to pileup line, tags as editing site if passed thresholds
:param: pileup_line, pileup line
:param: read_thresh_hold, minimum number of reads required for a valid editing site
:param: editing_min_percent_threshold, the minimum value of valid editing_percent
:param: editing_max_percent_threshold, the maximum value of valid editing_percent
:param: noise_percent_threshold, the maximum value of valid noise_percent
:param: const_tag, string representing constant tag that will be added for lines that are considered as editing site
:param add_percents: whether to add percents or to leave lines unchanged
:return: (editing_type,pileup_line) where editing_type is a tuple of the form (from_base,to_base)
and pileupline is the oirginal line with the new fields in
"""
clean_str = pileup_line.clean_base_string
sense_string = clean_str.replace(',', '.')
# TODO before use assumed letters are only upper case. I fix this assumption here
sense_string = sense_string.upper()
# Finds the candidate nucleotide reads
nucl_A, nucl_C, nucl_T, nucl_G = 0, 0, 0, 0
nucl_changes = {"A": 0, "C": 0, "G": 0, "T": 0}
for nucl in list(sense_string):
if (nucl == 'A' or nucl == 'C' or nucl == 'G' or nucl == 'T'):
nucl_changes[nucl] = nucl_changes[nucl] + 1
else:
continue
# get the maximal nucleous change, key is the value of the dict
(candidate_nucl, candidate_nucl_reads) = max(nucl_changes.items(), key=lambda x: x[1])
if candidate_nucl_reads == 0:
editing_percent = 0.0
noise_percent = 0.0
else:
editing_percent = round(float(100 * (candidate_nucl_reads / pileup_line.base_count)), 3)
# Find the noise reads
# all ACTGs that are not candidate_nucl
noise_reads = 0
for nucl in list(pileup_line.reads_string.upper()):
if (nucl != candidate_nucl) and (nucl in ['A', 'C', 'T', 'G']):
noise_reads += 1
noise_percent = round(float(100 * (noise_reads / pileup_line.base_count)), 3)
if is_reads_threshold_match(pileup_line, read_thresh_hold) and (editing_percent >= editing_min_percent_threshold
and editing_percent <= editing_max_percent_threshold) and (noise_percent <= noise_percent_threshold) and (
editing_percent is not 0.0) and (candidate_nucl_reads >= editing_read_thresh):
is_editing_site = True
else:
is_editing_site = False
'''
if (pileup_line._gene_pos == "10517276"):
print("pileupline",pileup_line, "read_thresh_hold",read_thresh_hold, "editing_percent",editing_percent,
"noise_percent", noise_percent, "candidate_nucl_reads", candidate_nucl_reads)
print("editing_min_percent_threshold",editing_min_percent_threshold, "editing_max_percent_threshold",
editing_max_percent_threshold, "noise_percent_threshold",noise_percent_threshold, "editing_read_thresh", editing_read_thresh)
'''
# change pileup line
pileup_line.tags["editing_percent"] = str(editing_percent)
pileup_line.tags["noise_percent"] = str(noise_percent)
if (const_tag != '') and is_editing_site:
pileup_line.tags["const_tag"] = const_tag
# calculate editing type
if is_editing_site:
#TODO patch change to make sure we dont get lowercase reference or nucl change
editing_type = (pileup_line.reference.upper(), candidate_nucl.upper())
#editing_type = (pileup_line.reference, candidate_nucl)
else:
editing_type = 'unchanged'
return editing_type, pileup_line
def is_reads_threshold_match(pileup_line, treshold):
return pileup_line.base_count >= treshold
def filter_pileup_by_categories(pileup_filename, output, reads_threshold=None, any_change=None, editing_min_percent=None,
editing_max_percent=None, noise_percent=None, editing_read_thresh=0):
"""
this function filtered a pileup file by reads threshold *or* by any change (editing percent > 0) *or* by editing
percent threshold and noise_percent threshold
:param pileup_filename: pileup file path
:param output: the output path for the filtered pileup
:param reads_threshold: if we want to filter by reads threshold - put a number in this var
:param any_change: if we want to filter by any change (more then 0 editing percent) put true in this var
:param editing_min/max_percent: if we want to filter by editing percent threashold, put threshold here
:param noise_percent: if we want to filter by noise_percent threashold, put threshold here
"""
with open(pileup_filename) as pileup, open(output, "w") as out:
for line in class_generator(Pileup_line, file=pileup):
if reads_threshold is not None and is_reads_threshold_match(line,reads_threshold):
out.write(str(line) + "\n")
if any_change:
editing_type, pileup_line = is_pileup_line_edited(line)
if editing_type != 'unchanged':
out.write(str(line) + "\n")
if editing_min_percent is not None:
if all([editing_max_percent, noise_percent]):
editing_type, pileup_line = is_pileup_line_edited(line, editing_min_percent_threshold=editing_min_percent,
editing_max_percent_threshold=editing_max_percent,
noise_percent_threshold=noise_percent,
editing_read_thresh=editing_read_thresh)
elif editing_max_percent is not None:
editing_type, pileup_line = is_pileup_line_edited(line, editing_min_percent_threshold=editing_min_percent, editing_max_percent_threshold=editing_max_percent, editing_read_thresh=editing_read_thresh)
elif noise_percent_threshold is not None:
editing_type, pileup_line = is_pileup_line_edited(line, editing_min_percent_threshold=editing_min_percent, noise_percent_threshold=noise_percent, editing_read_thresh=editing_read_thresh)
else:
editing_type, pileup_line = is_pileup_line_edited(line, editing_min_percent_threshold=editing_min_percent, editing_read_thresh=editing_read_thresh)
if editing_type != 'unchanged':
out.write(str(line) + "\n")
def get_header_line(const_tag=''):
if const_tag == '':
optional_fields_list = ['editing_min_percent', 'noise_percent']
else:
optional_fields_list = ['editing_min_percent', 'noise_percent', 'const_tag']
columns_list = ['Reference Id', 'Gene Position', 'Reference', 'Base Count', 'Reads String', 'Quality String']
columns_list += optional_fields_list
columns_list.append('Misc Tags')
header_line = '\t'.join(columns_list)
return header_line
def analyse_editing_percent(pileup_file, out_file, summary_file=None, add_headers=False, summary_only=False, min_editing=0,
max_noise=100, min_reads=1, edit_tag='' ):
"""
analyses pileup file editing sites and summarises it
@param pileup_file: input pileup file name
@param out_file: name of file to write new lines
@param summary_file: File to put the summary_string
@param add_headers: Boolean, whether to add csv like headers to new pileups
@param summary_only: Boolean, whether to only generate summary file and not generate edited pileup files
@param min_editing: minimal editing threshold, percent
@param max_noise: maximal noise threshold, percent
@param min_reads: minimal reads per site to be considered an editing site, int
@param edit_tag: tag to add to sites that are classified as editing sites
@param kwargs:
@return: dict with histogram of site editing types, side effect creates out_file a pile up file with tags
indicating noise, editing percent and wether a given site was classified as an editing site
"""
# with open in read and out write
with open(pileup_file, 'r') as in_fp, \
open(out_file, 'w') as out_fp:
#"r","y","s","w","k","m","b","d","h","v","n"
# intitalize zero counts in summary dict
#summary_dict = {(nuc1, nuc2): 0 for nuc1, nuc2 in itertools.product('acgtACGT', repeat=2) if nuc1.upper() != nuc2.upper()}
#TODO why do we need lower case references?
summary_dict = {(nuc1, nuc2): 0 for nuc1, nuc2 in itertools.product('ACGTN', repeat=2) if nuc1.upper() != nuc2.upper()}
#add RYSWKMBDHV to string if needed
summary_dict['unchanged'] = 0
if add_headers:
out_fp.writelines([get_header_line(edit_tag)])
out_fp.write("\n")
list_headers = ["editing_min_percent", "noise_percent"]
if edit_tag:
list_headers.append("const_tag")
pileup_gen = class_generator(Pileup_line, file=in_fp)
line_num = -1
for line_num, pileup_line in enumerate(pileup_gen):
edit_type, new_pileup_line = is_pileup_line_edited(pileup_line,
read_thresh_hold=min_reads,
editing_min_percent_threshold=float(min_editing),
noise_percent_threshold=float(max_noise),
const_tag=edit_tag
)
summary_dict[edit_type] += 1
# add empty tag if we make it into a csv so the columns will align
if add_headers and edit_type == 'unchanged' and edit_tag != '':
new_pileup_line.tags["const_tag"] = ''
# if not summarise only, print the pileup line
if not summary_only:
#out_fp.write("\n")
if not add_headers:
out_fp.writelines([str(new_pileup_line)])
out_fp.write("\n")
else:
out_fp.writelines([new_pileup_line.line_to_csv_with_short_tags(list_headers)])
out_fp.write("\n")
# dont skip file, create empty summary
#if line_num == -1 :
# open(summary_file, "w").close()
# return #when the file is empty - skip the file!
##out_fp.write("\n")
# make summary counts into percentage
total_line_number = line_num + 1
summary_dict_sub = {key: float(val) / total_line_number if total_line_number != 0 else 0 for key, val in summary_dict.items() }
# printing the summary format to an individual file
with open(summary_file, "w") as file1:
summer_str = individual_summary_format(pileup_file, summary_dict_sub)
file1.write(summer_str)
return summer_str
def individual_summary_format(filename, summary_dict):
"""
@param filename: the pileup file name, the dict refers to
@param summary_dict: the summary dict of the pileup file
@return: a string that represent the summary dict : pileup_filename\nchange:percentage\tchange:percentage\t...
"""
summer_str = ""
for change, percentage in summary_dict.items():
if summer_str == "":
summer_str = str(change) + ":" + str(percentage)
else:
summer_str = summer_str + "\t" + str(change) + ":" + str(percentage)
return filename + "\n" + summer_str
def individual_summary_format_to_dict(lines):
"""
@param lines: the lines of the summary file
@return: a dict that represent the lines (with the pileup name inside)
"""
summary_dict = dict()
summary_dict["name of file"] = lines.pop(0).split('\n')[0]
for line in lines[0].split("\t"):
change_and_percentage = line.split(":")
summary_dict[change_and_percentage[0]] = change_and_percentage[1]
return summary_dict
def summary_format(summary_dict):
"""
formats summary dicts into desired format
:param summary_dict: dictionary with percents per
:return: str of dict in desired format
"""
return str(summary_dict.items())
def analyse_multiple_editing_percent_files(pileup_files, out_pileups, summary_files,total_summary_file=None, add_headers=False,
summary_only=False, min_editing=0.0, max_editing=100.0, max_noise=100.0, min_reads=1,
edit_tag=None, parallel_limit=2,Disable_parallel=True ):
"""
analyses pileup file and summarises it
:param pileup_files: list of pileup_files
:param out_pileups: list of path names for pileups with editing percent
:param summary_file: list of path names for summary of all pileups
:param total_summary_file: list of path name for summary of all pileups
:param add_headers: Boolean, whether to add csv like headers to new pileups
:param summary_only: Boolean, whether to only generate summary file and not generate edited pileup files
:param min_editing: minimal editing threshold, percent (with the dot : 0.01 = 1%)
:param max_noise: maximal noise threshold, percent
:param min_reads: minimal reads per site to be considered an editing site, int
:param edit_tag: tag to add to sites that are classified as editing sites
:param parallel_limit: cap on number of parallel processes to run (passes to parallel_commands)
:param Disable_parallel: Bool wether to do processes one by one or run in parallel (passes to parallel_commands)
:return:
"""
# collecting commands for parallel
commands = []
for in_file, out_file, summary_file in zip(pileup_files, out_pileups, summary_files):
# False,False map to add_headers and summary_only
command = [analyse_editing_percent,in_file,out_file,summary_file,
add_headers,summary_only,min_editing,max_noise,min_reads,edit_tag]
commands.append(command)
# run parallel editing percent + summary printing for the pileup files
parallel_commands(commands,parallel_limit=parallel_limit,Disable_parallel=Disable_parallel)
# if total summary was requested,
# get the summary strings to write to the summary output file
if total_summary_file != None:
lines_list = get_full_summary_str(summary_files)
with open(total_summary_file, "w") as sumfile:
sumfile.write("\n".join(lines_list))
return
def get_full_summary_str(out_files):
"""
@param out_files: the list of the names of the pileup output files (with the editing percent)
@return: string contains the lines of the required summary file
"""
# creating a dict of change and percent from each file summary
keys_set = dict()
summary_list = []
for file_summer in out_files:
with open(file_summer, "r") as file1:
lines = file1.readlines()
if len(lines) == 0 :
continue
summary_dict_file = individual_summary_format_to_dict(lines)
for key in summary_dict_file.keys():
keys_set[key] = 1
summary_list.append(summary_dict_file)
lines_list = []
# preparing header
keys_set.pop("name of file", None)
header_line = "name of file"
keys_list = list(keys_set.keys())
for header in keys_list:
header_line = header_line + "\t" + str(header)
lines_list.append(header_line)
for dict_summer in summary_list:
line = dict_summer["name of file"]
for change in keys_list:
if change in dict_summer.keys():
line = line + "\t" + dict_summer[change] # when the base change is in that summeray file
else:
line = line + "\t " # when the change is not there - put empty stub
lines_list.append(line)
return lines_list
def print_params():
string = "The parameters are:" + "\n" \
+ "-editing_min_percent_threshold: the minimum editing percent that should be for the line that is considered as editing site.\n" \
+ "-editing_max_percent_threshold the maximum editing percent that should be for the line that is considered as editing site.\n" \
+ "-noise_percent_threshold: the maximum noise percent (\"5\" = 5%) that should be for the line that is considered as editing site.\n" \
+ "-out_filename: the name of the statistic output file (file that contain for each file the percent of lines that are considered as editing site).\n" \
+ "-pileup_files: list of pileup filenames.\n" \
+ "-out: Option to specify the name of the output directory (the default is the current directory).\n" \
+ "-const_tag: Option to add a constant tag string to each line in the file that is considered editing site.\n" \
+ "-noadd: Flag indicating that the pileup files already have 2 extra columns of editing_percent and noise_percent, and therefore I will not add these 2 extra columns.\n" \
+ "-addheaders: Choose this flag if you would like to add headers to each pileup file (used in order to parse the pileup to csv format easily).\n"
print(string)
def print_example():
input_pileup_str = "seq1\t272\tA\t24\t,.$....G,,.,.G...G,..^+.\t<<<<<<<<<<<<<<<<<<<<<\n" \
+ "seq1\t273\tC\t23\t,.....,,.,A,..,.,AAA\t<<<<<<<<<<<<<<<<<<<<\n" \
+ "seq1\t275\tT\t23\t,.....,,AAA,GG..,..A\t<<<<<<<<<<<<<<<<<<<<\n" \
+ "seq1\t275\tT\t23\t,.....,,,,,,....,..,\t<<<<<<<<<<<<<<<<<<<<\n" \
+ "seq1\t288\tT\t24\t,.$AAAAAAAAA.AAAAAAA\t<<<<<<<<<<<<<<<<<<<<<\n" \
+ "seq1\t285\tG\t23\tAAAAAACCCCC,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\n"
print("The pileup file is:\n" + input_pileup_str + "\n")
output_pileup_str = "seq1\t272\tA\t24\t,.$....G,,.,.G...G,..^+.\t<<<<<<<<<<<<<<<<<<<<<\tediting_percent:Z:12.5\tnoise_percent:Z:0.0\n" \
+ "seq1\t273\tC\t23\t,.....,,.,A,..,.,AAA\t<<<<<<<<<<<<<<<<<<<<\tediting_percent:Z:20.0\tnoise_percent:Z:0.0\n" \
+ "seq1\t275\tT\t23\t,.....,,AAA,GG..,..A\t<<<<<<<<<<<<<<<<<<<<\tediting_percent:Z:20.0\tnoise_percent:Z:10.0\n" \
+ "seq1\t275\tT\t23\t,.....,,,,,,....,..,\t<<<<<<<<<<<<<<<<<<<<\tediting_percent:Z:0.0\tnoise_percent:Z:0.0\n" \
+ "seq1\t288\tT\t24\t,.$AAAAAAAAA.AAAAAAA\t<<<<<<<<<<<<<<<<<<<<<\tediting_percent:Z:80.0\tnoise_percent:Z:0.0\n" \
+ "seq1\t285\tG\t23\tAAAAAACCCCC,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\tediting_percent:Z:35.0\tnoise_percent:Z:25.0\n"
print("The output pileup file (with editing percent and noise percent) is:\n" + output_pileup_str)
statistic_output_str = "\tName of File\tPercent of Valid Lines\n" \
+ "0\tpileup_filename\t50.0\n"
print(
"The statistic output with editing_min_percent_threshold=30 and noise_percent_threshold=50 is:\n" + statistic_output_str)
if __name__ == '__main__':
args = docopt(__doc__)
if args["param"]:
print_params()
sys.exit()
if args["example"]:
print_example()
sys.exit()
# Extracting the parameters
pileups_names = args['<pileup_files>']
count_threshold = int(args['--reads_min'])
editing_min_percent_threshold = float(args['--edit_min'])
noise_percent_threshold = float(args['--noise_max'])
if count_threshold < 1:
raise ValueError("count minimum cannot be nonpositive")
# Check that the range of the percent is correct
if editing_min_percent_threshold < 0 and editing_min_percent_threshold > 100 or noise_percent_threshold < 0 or noise_percent_threshold > 100 or editing_max_percent_threshold < 0 or editing_max_percent_threshold > 100:
raise ValueError("In editing or noise, Percent is not valid")
summary_filename = args['<summary_output_filename>']
out_dir = args['--out_dir']
if out_dir is None:
out_dir = os.path.dirname(summary_filename)
out_pileups=[pile+"_edited" for pile in pileups_names]
else:
summary_filename = os.path.join(out_dir, summary_filename)
pileup_base_names=[os.path.basename(pile) for pile in pileups_names]
out_pileups=[ os.path.join(out_dir,pile) for pile in pileup_base_names]
# if out dir doesnt exist make it
os.makedirs(out_dir,exist_ok=True)
# give all summary pileups the same sumamry
summary_filename=[summary_filename]*(len(pileups_names))
const_tag = args['--tag']
summarise_only = args['--summarise_only']
need_to_add_headers = args['--add_csv_headers']
debug_op = args['--debug']
'''if debug_op is None:
debug_op = False
else:
debug_op = True'''
analyse_multiple_editing_percent_files(pileups_names, out_pileups,
summary_files=summary_filename,
add_headers=need_to_add_headers,
summary_only=summarise_only,
min_editing=editing_min_percent_threshold,
max_editing=editing_max_percent_threshold,
max_noise=noise_percent_threshold,
min_reads=count_threshold,
edit_tag=const_tag,
)