Skip to content

Commit

Permalink
preparing for publication
Browse files Browse the repository at this point in the history
  • Loading branch information
DeanLight committed Jun 9, 2021
1 parent e6e139a commit 95dd6a4
Show file tree
Hide file tree
Showing 16 changed files with 1,102 additions and 4 deletions.
5 changes: 3 additions & 2 deletions Experiments/forontiers_jupyter/bar_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def stacked_bar(data, series_labels=None, category_labels=None,

if size_plot:
fig = plt.figure(figsize=size_plot)
plt.rcParams['font.size'] = '20'

suit_colors_dict = {}
for index, column in enumerate(series_labels):
Expand Down Expand Up @@ -78,10 +79,10 @@ def stacked_bar(data, series_labels=None, category_labels=None,
cum_size += row_data

if not category_labels is None:
plt.xticks(ind2, category_labels, rotation=20, fontsize=15)
plt.xticks(ind2, category_labels, rotation=20, fontsize=30)

if y_label != None:
plt.ylabel(y_label, fontsize=15)
plt.ylabel(y_label, fontsize=30)

plt.legend()

Expand Down
Empty file modified Experiments/forontiers_jupyter/resic_run_file.py
100644 → 100755
Empty file.
135 changes: 135 additions & 0 deletions Filtering/filter_pileup_by_existing_snps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
"""filter_pileup_by_existing_snp.py
The script receives pileup file, cns file(/pileup file) and an output filename.
it filter out the lines from the pileup file that overlapped with the cns file if there
was a change in the read string (one at least of atcgATCG and the other IUPAC nucleotide code in the reads string ) of the cns file.
by defult it keeps the lines that didnt overlap, if flag --rm-non-overlapping is added
it filter those lines out.
Usage:
snprationotincs.py param
snprationotincs.py example
snprationotincs.py <pileup_input> <cns_file> <output_file> [<output_file_rest>] [--rm-non-overlapping]
snprationotincs.py -h | --help
Options:
-h --help Show this screen.
-r --rm-non-overlapping Remove the lines that from pileup_input that didnt match no line in cns_file
"""


# the cns_file arg can be a pileup file or a Bsortedcns file #

from Utility.parallel_generator import *
from Utility.Pileup_class import Pileup_line
import itertools
from Utility.Bsotredcns_class import Bsortedcns_line
from Utility.generators_utilities import *
from docopt import docopt
import os


'''def get_pos_and_id(line):
"""
@param line : a pileup line object
return : ref_base , gene_pos of the pileup line
"""
return line.reference_id,line.gene_pos'''


'''def is_with_any_change(pileup_line):
"""
@param pileup_line: a pileup line (object)
return : true if in the reads string there is one or more of a c g t A C G T , else false
"""
#all the possible IUPAC nucleotide code
nuc_changes = ["a","c","g","t","r","y","s","w","k","m","b","d","h","v","n"]
all_nuc_changes = [letter.upper() for letter in nuc_changes] + nuc_changes
if any(change in pileup_line.reads() for change in all_nuc_changes):
return True
else :
return False'''


def filter_pileup_by_ref_file_comparing(pileup_input, ref_file, output_file,rest_output,ref_type=Pileup_line, remove_non_matched = False):
"""
@param pileup_input: pileup file name
@param ref_file: pileup/Bsortedcns file name
@param output_file: output file name with path
@param rest_output: the name of the filterout file
@param ref_type: "pileup" or "Bsortedcns" file
@param remove_non_matched: if remove_non_matched = true it filter lines that didnt overlap out, else keeps them.
action : receives pileup file, cns file(/pileup file) and an output file.
it filter out the lines from the pileup file that overlapped with the cns file if there
was a change in the read string (one at least of atcgATCG in the reads string ).
by defult it keeps the lines that didnt overlap, if remove_non_matched = true
it filter those lines out.
the filter- out lines are printed to a file named output_file (without ending ) + "filter_out."+ (the ending)
"""


#change_functor = is_with_any_change

with open(pileup_input, "r") as in_file, open(ref_file,"r") as ref, open(output_file, "w") as out, open(rest_output,"w") as out2 :

input_gen = class_generator(Pileup_line,file=in_file)
ref_gen = class_generator(ref_type,file=ref)

get_pos_and_id = lambda x : (x.reference_id,x.gene_pos)

parallel_gen = parallel_generator([input_gen,ref_gen],[get_pos_and_id,get_pos_and_id])

for [list_from_in_file, list_from_ref_file] in parallel_gen:
# when there is a line that matched the ref - check the changes,
# if not changed then print the line to output
if (list_from_in_file is not None) and (list_from_ref_file is not None): # if there is overlap
if not(list_from_ref_file[0].is_with_any_change()): # when there is no change in the reads of the ref file (all "," or ".")
out.write(str(list_from_in_file[0]) + "\n")
else :
out2.write(str(list_from_in_file[0]) + "\n")
elif (list_from_in_file is not None) and not(remove_non_matched): # when not matched, just print (only if requested to print the non matched)
out.write(str(list_from_in_file[0]) + "\n")
elif (list_from_in_file is not None): # the removed files are printed to the filter_out file
out2.write(str(list_from_in_file[0]) + "\n")


def print_params():
pass


def print_example():
pass


if __name__ == '__main__':
import sys
args = docopt(__doc__)
if args["param"]:
print_params()
sys.exit()

if args["example"]:
print_example()
sys.exit()

# Extracting the parameters
pileup_input = args['<pileup_input>']
ref_file = args['<cns_file>']
output_file = args['<output_file>']
remove = args['--rm-non-overlapping']
rest = args['<output_file_rest>']

if ref_file.split(".")[-1] == "Bsortedcns":
ref_type1 = Bsortedcns_line
else:
ref_type1 = Pileup_line
if not(rest):
filename, file_extension = os.path.splitext(output_file)
rest_output = filename + ".filter_out" + file_extension
else: rest_output = rest

filter_pileup_by_ref_file_comparing(pileup_input,ref_file,output_file,rest_output,ref_type=ref_type1,remove_non_matched=remove)
74 changes: 74 additions & 0 deletions Filtering/filter_pileup_by_reference_base_set.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#TODO refactor

class IllegalArgument(Exception):
def __init__(self):
self.message = "you should enter name of pileup file, output file and at least one Nucleotide for ref base"

def __str__(self):
return self.message

def __repr__(self):
return self.message


def filter_pileup_by_reference_base_set(pileup,output,listBases):
"""
:param pileup: the pile up we want to filter
:param listBases: a list of ref bases we want to filter by
:param output: the output file that will contain lines with only the bases in listBases as ref base
"""
with open(pileup,"r") as file1:
line = file1.readline()
with open(output,"w") as fileout:
while line is not None and line != "":
lineParts = line.split()
if lineParts[2] in listBases:
fileout.write(line)
line=file1.readline()



if __name__ == "__main__" :

"""
input : python filter_pileup_by_reference_base_set.py pileup_filename output_filename ref_bases_list
example : python filter_pileup_by_reference_base_set.py exmple.pileup output.pileup A G C
pileup_filename - the file we want to filter from by ref bases - to keep the lines with those ref bases
output_filename - the output file for the result filter
ref_bases_list - a list of ref bases we want to keep in the output file
output : the lines from the pileup_filename file that contain one of the ref_bases_list as ref base.
"""
import sys
if len(sys.argv)<3 : raise IllegalArgument
pileup = sys.argv[1]
output = sys.argv[2]

numberOfBases = len(sys.argv) -3
if numberOfBases == 0:
raise IllegalArgument
if numberOfBases > 0:
one = sys.argv[3]
if numberOfBases > 1:
two = sys.argv[4]
if numberOfBases > 2:
three = sys.argv[5]
if numberOfBases > 3:
four = sys.argv[6]
if numberOfBases > 4:
five = sys.argv[7]
filter_pileup_by_reference_base_set(pileup,output,[one,two,three,four,five])
else:
filter_pileup_by_reference_base_set(pileup, output,[one, two, three, four])
else:
filter_pileup_by_reference_base_set(pileup,output, [one, two, three])
else:
filter_pileup_by_reference_base_set(pileup,output, [one, two])
else:
filter_pileup_by_reference_base_set(pileup,output, [one])




117 changes: 117 additions & 0 deletions Filtering/filter_pileup_by_site_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""Filter pileup by site list
Usage:
Filter_pileup_by_site_list param
Filter_pileup_by_site_list example
Filter_pileup_by_site_list <pileup_file> <out_file> <site_list>
Filter_pileup_by_site_list -h | --help
Options:
-h --help Show this screen
"""



import sys
from docopt import docopt
import re



def filter_pileup_by_site_list(pileup,output,listSites):
"""
:param pileup: the pile up we want to filter
:param listSites: a list of sites - (chr , pos) - we want to filter by
:param output: the output file that will contain lines with only the sites in listSites as ref base
"""
with open(pileup,"r") as file1:
line = file1.readline()
with open(output,"w") as fileout:
while line is not None and line != "":
lineParts = line.split()
if (lineParts[0],lineParts[1]) in listSites:
fileout.write(line)
line=file1.readline()



def param_description():
"""
Description for the script
"""
print("The parameters are\n" +
"pileup_file: the pileup file you want to filter\n" +
"out_file: the name of the output file\n" +
"site_list: the name of the file where each line is (char, pos) that we want to filter by\n")


def example_description():
"""
Usage example for the script
"""
print("Pileup input:\n" +
"seq1\t272\tA\t24\t,.$.....,,.,.G...G,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t\n" +
"seq1\t273\tC\t23\t,.....,,.,A,..,.,AAA\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t274\tT\t24\t,.$.....,,.,.,,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t275\tT\t23\t,.....,,AAA,GG..,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t276\tT\t24\t,.$.....CCC,.,,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t277\tT\t23\t,.....,,***.,.,..,.,..A\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t278\tT\t23\t,.....,,.,.AA.,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t279\tT\t23\t,...AA,,A,A,....,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t280\tG\t24\t,.$..A.AA,.AAA,,A.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t281\tT\t23\t,.....,,.,.,,,AA,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t282\tT\t24\t,.$.....,,.,..,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t283\tT\t23\t,.....,AAA.,CCCC,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t284\tT\t23\t,.....,,.,.,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t285\tG\t23\tAAAAAACCCCC,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t286\tT\t24\t,.$.....+3ABC,,.,A,,,,AA.....\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t287\tT\t23\t,.....,-2AA,.,.,..,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t288\tT\t24\t,.$AAAAAAAAA.AAAAAAA^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t289\tG\t23\tTTTTTTTTTT..TTTTTTTTT\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t290\tC\t23\tGG$GGGGGGGGGGGGGGGGG\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq2\t291\tC\t23\t...$GGGGGG....A......\t<<<<<<<<<<<<<<<<<<<<\t" +
"Sites list file:\n" +
"seq1, 273\n seq1, 274\n seq1, 282\n seq1, 283\n seq1, 284\n seq1, 288\n seq1, 290\n seq2, 291\n" +
"Calling the script:\n" +
"python filter_pileup_by_site_list pileup_file out_file site_list\n" +
"And the output:\n" +
"seq1\t273\tC\t23\t,.....,,.,A,..,.,AAA\t<<<<<<<<<<<<<<<<<<<<\t" +"\n"+
"seq1\t274\tT\t24\t,.$.....,,.,.,,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t282\tT\t24\t,.$.....,,.,..,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t283\tT\t23\t,.....,AAA.,CCCC,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t284\tT\t23\t,.....,,.,.,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t288\tT\t24\t,.$AAAAAAAAA.AAAAAAA^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
"seq1\t290\tC\t23\tGG$GGGGGGGGGGGGGGGGG\t<<<<<<<<<<<<<<<<<<<<\t"+ "\n" +
"seq2\t291\tC\t23\t...$GGGGGG....A......\t<<<<<<<<<<<<<<<<<<<<\t")



"""
:param pileup: the pile up we want to filter
:param output: the output file that will contain lines with only the sites in listSites as ref base
:param listSites: file where each line is (char, pos) that we want to filter by
:output: output file will contain only the lines in the given (chr, pos) positions
"""

if __name__ == "__main__" :
args = docopt(__doc__)
if args["param"]:
param_description()
sys.exit()

if args["example"]:
example_description()
sys.exit()

pileup = args['<pileup_file>']
output = args['<out_file>']
listSites = args['<site_list>']
listPos = []
with open(listSites,"r") as file1:
for item in file1:
iList=re.split( ",|\n" , item)
if len(iList)==1:
iList=item.split("\t")
listPos.append((iList[0],iList[1]))
filter_pileup_by_site_list(pileup,output,listPos)

Loading

0 comments on commit 95dd6a4

Please sign in to comment.