preparing for publication

junhull · Jun 9, 2021 · 95dd6a4 · 95dd6a4
1 parent e6e139a
commit 95dd6a4
Show file tree

Hide file tree

Showing 16 changed files with 1,102 additions and 4 deletions.
diff --git a/Experiments/forontiers_jupyter/bar_utils.py b/Experiments/forontiers_jupyter/bar_utils.py
@@ -51,6 +51,7 @@ def stacked_bar(data, series_labels=None, category_labels=None,
 
     if size_plot:
         fig = plt.figure(figsize=size_plot)
+    plt.rcParams['font.size'] = '20'
 
     suit_colors_dict = {}
     for index, column in enumerate(series_labels):
@@ -78,10 +79,10 @@ def stacked_bar(data, series_labels=None, category_labels=None,
         cum_size += row_data
 
     if not category_labels is None:
-        plt.xticks(ind2, category_labels, rotation=20, fontsize=15)
+        plt.xticks(ind2, category_labels, rotation=20, fontsize=30)
 
     if y_label != None:
-        plt.ylabel(y_label, fontsize=15)
+        plt.ylabel(y_label, fontsize=30)
 
     plt.legend()
 

diff --git a/Experiments/forontiers_jupyter/resic_run_file.py b/Experiments/forontiers_jupyter/resic_run_file.py
diff --git a/Filtering/filter_pileup_by_existing_snps.py b/Filtering/filter_pileup_by_existing_snps.py
@@ -0,0 +1,135 @@
+"""filter_pileup_by_existing_snp.py
+
+The script receives pileup file, cns file(/pileup file) and an output filename.
+it filter out the lines from the pileup file that overlapped with the cns file if there
+was a change in the read string (one at least of atcgATCG and the other IUPAC nucleotide code in the reads string ) of the cns file.
+by defult it keeps the lines that didnt overlap, if flag --rm-non-overlapping is added
+it filter those lines out.
+
+
+Usage:
+    snprationotincs.py param
+    snprationotincs.py example
+    snprationotincs.py <pileup_input> <cns_file> <output_file> [<output_file_rest>] [--rm-non-overlapping]
+    snprationotincs.py -h | --help
+
+Options:
+    -h --help                       Show this screen.
+    -r --rm-non-overlapping         Remove the lines that from pileup_input that didnt match no line in cns_file
+
+"""
+
+
+# the cns_file arg can be a pileup file or a Bsortedcns file #
+
+from Utility.parallel_generator import *
+from Utility.Pileup_class import Pileup_line
+import itertools
+from Utility.Bsotredcns_class import Bsortedcns_line
+from Utility.generators_utilities import *
+from docopt import docopt
+import os
+
+
+'''def get_pos_and_id(line):
+    """
+    @param line : a pileup line object
+    return : ref_base , gene_pos of the pileup line
+    """
+
+    return line.reference_id,line.gene_pos'''
+
+
+'''def is_with_any_change(pileup_line): 
+    """
+    @param pileup_line: a pileup line (object)
+    return : true if in the reads string there is one or more of a c g t A C G T , else false
+    """
+    #all the possible IUPAC nucleotide code
+    nuc_changes = ["a","c","g","t","r","y","s","w","k","m","b","d","h","v","n"]
+    all_nuc_changes = [letter.upper() for letter in nuc_changes] + nuc_changes
+
+    if any(change in pileup_line.reads() for change in all_nuc_changes):
+        return True
+    else : 
+        return False'''
+
+
+def filter_pileup_by_ref_file_comparing(pileup_input, ref_file, output_file,rest_output,ref_type=Pileup_line, remove_non_matched = False):
+    """
+    @param pileup_input: pileup file name
+    @param ref_file: pileup/Bsortedcns file name
+    @param output_file: output file name with path
+    @param rest_output: the name of the filterout file
+    @param ref_type: "pileup" or "Bsortedcns" file
+    @param remove_non_matched: if remove_non_matched = true it filter lines that didnt overlap out, else keeps them.
+    action : receives pileup file, cns file(/pileup file) and an output file.
+                it filter out the lines from the pileup file that overlapped with the cns file if there 
+                was a change in the read string (one at least of atcgATCG in the reads string ).
+                by defult it keeps the lines that didnt overlap, if remove_non_matched = true 
+                it filter those lines out.
+                the filter- out lines are printed to a file named output_file (without ending ) + "filter_out."+ (the ending)
+    """
+
+
+    #change_functor = is_with_any_change
+
+    with open(pileup_input, "r") as in_file, open(ref_file,"r") as ref, open(output_file, "w") as out, open(rest_output,"w") as out2 :
+
+        input_gen = class_generator(Pileup_line,file=in_file)
+        ref_gen = class_generator(ref_type,file=ref)
+
+        get_pos_and_id = lambda x : (x.reference_id,x.gene_pos)
+
+        parallel_gen = parallel_generator([input_gen,ref_gen],[get_pos_and_id,get_pos_and_id])
+
+        for [list_from_in_file, list_from_ref_file] in parallel_gen:
+                # when there is a line that matched the ref - check the changes, 
+                # if not changed then print the line to output
+            if (list_from_in_file is not None) and (list_from_ref_file is not None): # if there is overlap
+                if not(list_from_ref_file[0].is_with_any_change()): # when there is no change in the reads of the ref file (all "," or ".")
+                    out.write(str(list_from_in_file[0]) + "\n")
+                else : 
+                    out2.write(str(list_from_in_file[0]) + "\n")
+            elif (list_from_in_file is not None) and not(remove_non_matched):  # when not matched, just print (only if requested to print the non matched)
+                out.write(str(list_from_in_file[0]) + "\n")
+            elif (list_from_in_file is not None): # the removed files are printed to the filter_out file
+                out2.write(str(list_from_in_file[0]) + "\n")
+
+
+def print_params():
+    pass
+
+
+def print_example():
+    pass
+
+
+if __name__ == '__main__':
+    import sys
+    args = docopt(__doc__)
+    if args["param"]:
+        print_params()
+        sys.exit()
+
+    if args["example"]:
+        print_example()
+        sys.exit()
+
+        # Extracting the parameters
+    pileup_input = args['<pileup_input>']
+    ref_file = args['<cns_file>']
+    output_file = args['<output_file>']
+    remove = args['--rm-non-overlapping']
+    rest = args['<output_file_rest>']
+
+    if ref_file.split(".")[-1] == "Bsortedcns":
+        ref_type1 = Bsortedcns_line
+    else:
+        ref_type1 = Pileup_line
+    if not(rest):
+        filename, file_extension = os.path.splitext(output_file)
+        rest_output = filename + ".filter_out" + file_extension
+    else: rest_output = rest 
+
+    filter_pileup_by_ref_file_comparing(pileup_input,ref_file,output_file,rest_output,ref_type=ref_type1,remove_non_matched=remove)
diff --git a/Filtering/filter_pileup_by_reference_base_set.py b/Filtering/filter_pileup_by_reference_base_set.py
@@ -0,0 +1,74 @@
+#TODO refactor
+
+class IllegalArgument(Exception):
+    def __init__(self):
+        self.message = "you should enter name of pileup file, output file and at least one Nucleotide for ref base"
+
+    def __str__(self):
+        return self.message
+
+    def __repr__(self):
+        return self.message
+
+
+def filter_pileup_by_reference_base_set(pileup,output,listBases):
+    """
+    :param pileup:  the pile up we want to filter
+    :param listBases: a list of ref bases we want to filter by
+    :param output: the output file that will contain lines with only the bases in listBases as ref base
+    """
+    with open(pileup,"r") as file1:
+        line = file1.readline()
+        with open(output,"w") as fileout:
+            while line is not None and line != "":
+                lineParts = line.split()
+                if lineParts[2] in listBases:
+                    fileout.write(line)
+                line=file1.readline()
+
+
+
+if __name__ == "__main__" :
+
+    """
+    input : python filter_pileup_by_reference_base_set.py pileup_filename output_filename ref_bases_list
+            example : python filter_pileup_by_reference_base_set.py exmple.pileup output.pileup A G C
+
+            pileup_filename - the file we want to filter from by ref bases - to keep the lines with those ref bases
+            output_filename - the output file for the result filter
+            ref_bases_list - a list of ref bases we want to keep in the output file
+
+    output : the lines from the pileup_filename file that contain one of the ref_bases_list as ref base.
+
+    """
+    import sys
+    if len(sys.argv)<3 : raise IllegalArgument
+    pileup = sys.argv[1]
+    output = sys.argv[2]
+
+    numberOfBases = len(sys.argv) -3
+    if numberOfBases == 0: 
+        raise IllegalArgument
+    if numberOfBases > 0:
+        one = sys.argv[3]
+        if numberOfBases > 1:
+            two = sys.argv[4]
+            if numberOfBases > 2:
+                three = sys.argv[5]
+                if numberOfBases > 3:
+                    four = sys.argv[6]
+                    if numberOfBases > 4:
+                        five = sys.argv[7]
+                        filter_pileup_by_reference_base_set(pileup,output,[one,two,three,four,five])
+                    else:
+                        filter_pileup_by_reference_base_set(pileup, output,[one, two, three, four])
+                else:
+                    filter_pileup_by_reference_base_set(pileup,output, [one, two, three])
+            else:
+                filter_pileup_by_reference_base_set(pileup,output, [one, two])
+        else: 
+            filter_pileup_by_reference_base_set(pileup,output, [one])
+
+
+
+
diff --git a/Filtering/filter_pileup_by_site_list.py b/Filtering/filter_pileup_by_site_list.py
@@ -0,0 +1,117 @@
+"""Filter pileup by site list
+
+    Usage:
+        Filter_pileup_by_site_list param
+        Filter_pileup_by_site_list example
+        Filter_pileup_by_site_list <pileup_file> <out_file> <site_list>
+        Filter_pileup_by_site_list -h | --help
+
+    Options:
+        -h --help   Show this screen
+"""
+
+
+
+import sys
+from docopt import docopt
+import re
+
+
+
+def filter_pileup_by_site_list(pileup,output,listSites):
+    """
+    :param pileup:  the pile up we want to filter
+    :param listSites: a list of sites - (chr , pos) - we want to filter by
+    :param output: the output file that will contain lines with only the sites in listSites as ref base
+    """
+    with open(pileup,"r") as file1:
+        line = file1.readline()
+        with open(output,"w") as fileout:
+            while line is not None and line != "":
+                lineParts = line.split()
+                if (lineParts[0],lineParts[1]) in listSites:
+                    fileout.write(line)
+                line=file1.readline()
+
+
+
+def param_description():
+    """
+    Description for the script
+    """
+    print("The parameters are\n" +
+          "pileup_file: the pileup file you want to filter\n" +
+          "out_file: the name of the output file\n" +
+          "site_list: the name of the file where each line is (char, pos) that we want to filter by\n")
+
+
+def example_description():
+    """
+    Usage example for the script
+    """
+    print("Pileup input:\n" +
+          "seq1\t272\tA\t24\t,.$.....,,.,.G...G,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t\n" +
+          "seq1\t273\tC\t23\t,.....,,.,A,..,.,AAA\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t274\tT\t24\t,.$.....,,.,.,,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t275\tT\t23\t,.....,,AAA,GG..,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t276\tT\t24\t,.$.....CCC,.,,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t277\tT\t23\t,.....,,***.,.,..,.,..A\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t278\tT\t23\t,.....,,.,.AA.,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t279\tT\t23\t,...AA,,A,A,....,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t280\tG\t24\t,.$..A.AA,.AAA,,A.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t281\tT\t23\t,.....,,.,.,,,AA,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t282\tT\t24\t,.$.....,,.,..,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t283\tT\t23\t,.....,AAA.,CCCC,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t284\tT\t23\t,.....,,.,.,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t285\tG\t23\tAAAAAACCCCC,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t286\tT\t24\t,.$.....+3ABC,,.,A,,,,AA.....\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t287\tT\t23\t,.....,-2AA,.,.,..,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t288\tT\t24\t,.$AAAAAAAAA.AAAAAAA^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t289\tG\t23\tTTTTTTTTTT..TTTTTTTTT\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq1\t290\tC\t23\tGG$GGGGGGGGGGGGGGGGG\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+          "seq2\t291\tC\t23\t...$GGGGGG....A......\t<<<<<<<<<<<<<<<<<<<<\t" +
+          "Sites list file:\n" +
+          "seq1, 273\n seq1, 274\n seq1, 282\n seq1, 283\n seq1, 284\n seq1, 288\n seq1, 290\n seq2, 291\n" +
+          "Calling the script:\n" +
+          "python filter_pileup_by_site_list pileup_file out_file site_list\n" +
+          "And the output:\n" +
+          "seq1\t273\tC\t23\t,.....,,.,A,..,.,AAA\t<<<<<<<<<<<<<<<<<<<<\t" +"\n"+
+		  "seq1\t274\tT\t24\t,.$.....,,.,.,,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+		  "seq1\t282\tT\t24\t,.$.....,,.,..,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+		  "seq1\t283\tT\t23\t,.....,AAA.,CCCC,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+		  "seq1\t284\tT\t23\t,.....,,.,.,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+		  "seq1\t288\tT\t24\t,.$AAAAAAAAA.AAAAAAA^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" +
+		  "seq1\t290\tC\t23\tGG$GGGGGGGGGGGGGGGGG\t<<<<<<<<<<<<<<<<<<<<\t"+ "\n" +
+		  "seq2\t291\tC\t23\t...$GGGGGG....A......\t<<<<<<<<<<<<<<<<<<<<\t")
+
+
+
+"""
+:param pileup:  the pile up we want to filter
+:param output: the output file that will contain lines with only the sites in listSites as ref base
+:param listSites: file where each line is (char, pos) that we want to filter by
+:output: output file will contain only the lines in the given (chr, pos) positions
+"""
+
+if __name__ == "__main__" :
+    args = docopt(__doc__)
+    if args["param"]:
+        param_description()
+        sys.exit()
+
+    if args["example"]:
+        example_description()
+        sys.exit()
+
+    pileup = args['<pileup_file>']
+    output = args['<out_file>']
+    listSites = args['<site_list>']
+    listPos = []
+    with open(listSites,"r") as file1:
+        for item in file1:
+            iList=re.split( ",|\n" , item)
+            if len(iList)==1:
+                iList=item.split("\t")
+            listPos.append((iList[0],iList[1]))
+    filter_pileup_by_site_list(pileup,output,listPos)
+