diff --git a/Experiments/forontiers_jupyter/bar_utils.py b/Experiments/forontiers_jupyter/bar_utils.py
index ec3d76c..6d53199 100755
--- a/Experiments/forontiers_jupyter/bar_utils.py
+++ b/Experiments/forontiers_jupyter/bar_utils.py
@@ -51,6 +51,7 @@ def stacked_bar(data, series_labels=None, category_labels=None,
 
     if size_plot:
         fig = plt.figure(figsize=size_plot)
+    plt.rcParams['font.size'] = '20'
 
     suit_colors_dict = {}
     for index, column in enumerate(series_labels):
@@ -78,10 +79,10 @@ def stacked_bar(data, series_labels=None, category_labels=None,
         cum_size += row_data
 
     if not category_labels is None:
-        plt.xticks(ind2, category_labels, rotation=20, fontsize=15)
+        plt.xticks(ind2, category_labels, rotation=20, fontsize=30)
 
     if y_label != None:
-        plt.ylabel(y_label, fontsize=15)
+        plt.ylabel(y_label, fontsize=30)
 
     plt.legend()
diff --git a/Experiments/forontiers_jupyter/resic_run_file.py b/Experiments/forontiers_jupyter/resic_run_file.py
old mode 100644
new mode 100755
diff --git a/Filtering/filter_pileup_by_existing_snps.py b/Filtering/filter_pileup_by_existing_snps.py
new file mode 100644
index 0000000..a7195b7
--- /dev/null
+++ b/Filtering/filter_pileup_by_existing_snps.py
@@ -0,0 +1,135 @@
+"""filter_pileup_by_existing_snps.py
+
+The script receives a pileup file, a cns file (or a second pileup file) and an output filename.
+It filters out the lines of the pileup file that overlap a line of the cns file whose read
+string shows a change (at least one of atcgATCG, or another IUPAC nucleotide code, in the reads string).
+By default it keeps the lines that did not overlap; if the flag --rm-non-overlapping is added,
+those lines are filtered out as well.
+
+
+Usage:
+    filter_pileup_by_existing_snps.py param
+    filter_pileup_by_existing_snps.py example
+    filter_pileup_by_existing_snps.py <pileup_input> <cns_file> <output_file> [<rest_output>] [--rm-non-overlapping]
+    filter_pileup_by_existing_snps.py -h | --help
+
+Options:
+    -h --help                  Show this screen.
+    -r --rm-non-overlapping    Remove the lines from pileup_input that matched no line in cns_file
+
+"""
+
+
+# the cns_file arg can be a pileup file or a Bsortedcns file #
+
+from Utility.parallel_generator import *
+from Utility.Pileup_class import Pileup_line
+import itertools
+from Utility.Bsotredcns_class import Bsortedcns_line
+from Utility.generators_utilities import *
+from docopt import docopt
+import os
+
+
+'''def get_pos_and_id(line):
+    """
+    @param line : a pileup line object
+    return : reference_id, gene_pos of the pileup line
+    """
+
+    return line.reference_id, line.gene_pos'''
+
+
+'''def is_with_any_change(pileup_line):
+    """
+    @param pileup_line: a pileup line (object)
+    return : True if the reads string contains one or more of a c g t A C G T, else False
+    """
+    # all the possible IUPAC nucleotide codes
+    nuc_changes = ["a","c","g","t","r","y","s","w","k","m","b","d","h","v","n"]
+    all_nuc_changes = [letter.upper() for letter in nuc_changes] + nuc_changes
+
+    if any(change in pileup_line.reads() for change in all_nuc_changes):
+        return True
+    else:
+        return False'''
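# --- Illustrative sketch (editor's example, not part of the patch): how the
# commented-out IUPAC change test above behaves on a raw pileup read string.
# The `reads` values are invented for the example.
iupac = "acgtryswkmbdhvn"
all_changes = set(iupac + iupac.upper())

def has_any_change(reads):
    # True if any IUPAC nucleotide letter appears among the match symbols
    return any(ch in all_changes for ch in reads)

assert has_any_change(",.$...A,,.")      # an 'A' mismatch counts as a change
assert not has_any_change(",.$.....,,")  # only matches to the reference
# ---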
+def filter_pileup_by_ref_file_comparing(pileup_input, ref_file, output_file, rest_output, ref_type=Pileup_line, remove_non_matched=False):
+    """
+    @param pileup_input: pileup file name
+    @param ref_file: pileup/Bsortedcns file name
+    @param output_file: output file name with path
+    @param rest_output: the name of the filtered-out file
+    @param ref_type: the line class of ref_file, Pileup_line or Bsortedcns_line
+    @param remove_non_matched: if True, filter out the lines that did not overlap; otherwise keep them.
+    action: receives a pileup file, a cns file (or a second pileup file) and an output file.
+    It filters out the lines of the pileup file that overlap a cns line whose read string
+    shows a change (at least one of atcgATCG in the reads string).
+    By default it keeps the lines that did not overlap; if remove_non_matched is True,
+    those lines are filtered out as well.
+    The filtered-out lines are written to rest_output, which defaults to the name of
+    output_file (without the extension) + ".filter_out" + the extension.
+    """
+
+    # change_functor = is_with_any_change
+
+    with open(pileup_input, "r") as in_file, open(ref_file, "r") as ref, open(output_file, "w") as out, open(rest_output, "w") as out2:
+
+        input_gen = class_generator(Pileup_line, file=in_file)
+        ref_gen = class_generator(ref_type, file=ref)
+
+        get_pos_and_id = lambda x: (x.reference_id, x.gene_pos)
+
+        parallel_gen = parallel_generator([input_gen, ref_gen], [get_pos_and_id, get_pos_and_id])
+
+        for [list_from_in_file, list_from_ref_file] in parallel_gen:
+            # when a line matches the ref - check for changes;
+            # if nothing changed, print the line to the output
+            if (list_from_in_file is not None) and (list_from_ref_file is not None):  # if there is overlap
+                if not (list_from_ref_file[0].is_with_any_change()):  # no change in the reads of the ref file (all "," or ".")
+                    out.write(str(list_from_in_file[0]) + "\n")
+                else:
+                    out2.write(str(list_from_in_file[0]) + "\n")
+            elif (list_from_in_file is not None) and not (remove_non_matched):  # not matched - keep, unless asked to remove
+                out.write(str(list_from_in_file[0]) + "\n")
+            elif (list_from_in_file is not None):  # the removed lines are printed to the filter_out file
+                out2.write(str(list_from_in_file[0]) + "\n")
+
+
+def print_params():
+    pass
+
+
+def print_example():
+    pass
+
+
+if __name__ == '__main__':
+    import sys
+    args = docopt(__doc__)
+    if args["param"]:
+        print_params()
+        sys.exit()
+
+    if args["example"]:
+        print_example()
+        sys.exit()
+
+    # Extracting the parameters
+    pileup_input = args['<pileup_input>']
+    ref_file = args['<cns_file>']
+    output_file = args['<output_file>']
+    remove = args['--rm-non-overlapping']
+    rest = args['<rest_output>']
+
+    if ref_file.split(".")[-1] == "Bsortedcns":
+        ref_type1 = Bsortedcns_line
+    else:
+        ref_type1 = Pileup_line
+    if not rest:
+        filename, file_extension = os.path.splitext(output_file)
+        rest_output = filename + ".filter_out" + file_extension
+    else:
+        rest_output = rest
+
+    filter_pileup_by_ref_file_comparing(pileup_input, ref_file, output_file, rest_output, ref_type=ref_type1, remove_non_matched=remove)
\ No newline at end of file
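# --- Illustrative sketch (editor's example, not part of the patch): driving the
# filter above directly from Python instead of through docopt. The file names
# are hypothetical; assumes the repo root is on PYTHONPATH as the README arranges.
from Filtering.filter_pileup_by_existing_snps import filter_pileup_by_ref_file_comparing
from Utility.Bsotredcns_class import Bsortedcns_line

filter_pileup_by_ref_file_comparing(
    "sample.pileup",             # pileup to be filtered
    "known_snps.Bsortedcns",     # reference of known SNP positions
    "sample.filtered.pileup",    # lines kept
    "sample.filter_out.pileup",  # lines removed
    ref_type=Bsortedcns_line,
    remove_non_matched=False)    # keep lines with no overlap in the reference
# ---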
diff --git a/Filtering/filter_pileup_by_reference_base_set.py b/Filtering/filter_pileup_by_reference_base_set.py
new file mode 100755
index 0000000..091c18f
--- /dev/null
+++ b/Filtering/filter_pileup_by_reference_base_set.py
@@ -0,0 +1,74 @@
+# TODO refactor
+
+class IllegalArgument(Exception):
+    def __init__(self):
+        self.message = "you should enter the name of a pileup file, an output file and at least one nucleotide for the ref base"
+
+    def __str__(self):
+        return self.message
+
+    def __repr__(self):
+        return self.message
+
+
+def filter_pileup_by_reference_base_set(pileup, output, listBases):
+    """
+    :param pileup: the pileup we want to filter
+    :param listBases: a list of ref bases we want to filter by
+    :param output: the output file that will contain only the lines whose ref base is in listBases
+    """
+    with open(pileup, "r") as file1:
+        line = file1.readline()
+        with open(output, "w") as fileout:
+            while line is not None and line != "":
+                lineParts = line.split()
+                if lineParts[2] in listBases:
+                    fileout.write(line)
+                line = file1.readline()
+
+
+if __name__ == "__main__":
+
+    """
+    input : python filter_pileup_by_reference_base_set.py pileup_filename output_filename ref_bases_list
+    example : python filter_pileup_by_reference_base_set.py exmple.pileup output.pileup A G C
+
+    pileup_filename - the file we want to filter by ref bases - keeping the lines with those ref bases
+    output_filename - the output file for the filtered result
+    ref_bases_list - a list of ref bases we want to keep in the output file
+
+    output : the lines from the pileup_filename file whose ref base is one of ref_bases_list.
+    """
+    import sys
+    if len(sys.argv) < 4:  # need a pileup file, an output file and at least one base
+        raise IllegalArgument
+    pileup = sys.argv[1]
+    output = sys.argv[2]
+    bases = sys.argv[3:]  # all remaining arguments are ref bases
+    filter_pileup_by_reference_base_set(pileup, output, bases)
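# --- Illustrative sketch (editor's example, not part of the patch): the
# keep/drop test used above, on one pileup line taken from the example data
# further down in this patch.
line = "seq1\t272\tA\t24\t,.$.....,,.,.G...G,..^+.\t<<<<<<<<<<<<<<<<<<<<<"
assert line.split()[2] in ["A", "G", "C"]  # ref base 'A' - the line is kept
assert line.split()[2] not in ["T"]        # it would be dropped for a T-only run
# ---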
"seq1\t280\tG\t24\t,.$..A.AA,.AAA,,A.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t281\tT\t23\t,.....,,.,.,,,AA,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t282\tT\t24\t,.$.....,,.,..,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t283\tT\t23\t,.....,AAA.,CCCC,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t284\tT\t23\t,.....,,.,.,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t285\tG\t23\tAAAAAACCCCC,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t286\tT\t24\t,.$.....+3ABC,,.,A,,,,AA.....\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t287\tT\t23\t,.....,-2AA,.,.,..,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t288\tT\t24\t,.$AAAAAAAAA.AAAAAAA^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t289\tG\t23\tTTTTTTTTTT..TTTTTTTTT\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t290\tC\t23\tGG$GGGGGGGGGGGGGGGGG\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq2\t291\tC\t23\t...$GGGGGG....A......\t<<<<<<<<<<<<<<<<<<<<\t" + + "Sites list file:\n" + + "seq1, 273\n seq1, 274\n seq1, 282\n seq1, 283\n seq1, 284\n seq1, 288\n seq1, 290\n seq2, 291\n" + + "Calling the script:\n" + + "python filter_pileup_by_site_list pileup_file out_file site_list\n" + + "And the output:\n" + + "seq1\t273\tC\t23\t,.....,,.,A,..,.,AAA\t<<<<<<<<<<<<<<<<<<<<\t" +"\n"+ + "seq1\t274\tT\t24\t,.$.....,,.,.,,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t282\tT\t24\t,.$.....,,.,..,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t283\tT\t23\t,.....,AAA.,CCCC,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t284\tT\t23\t,.....,,.,.,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t288\tT\t24\t,.$AAAAAAAAA.AAAAAAA^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + + "seq1\t290\tC\t23\tGG$GGGGGGGGGGGGGGGGG\t<<<<<<<<<<<<<<<<<<<<\t"+ "\n" + + "seq2\t291\tC\t23\t...$GGGGGG....A......\t<<<<<<<<<<<<<<<<<<<<\t") + + + +""" +:param pileup: the pile up we want to filter +:param output: the output file that will contain lines with only the sites in listSites as ref base +:param listSites: file where each line is (char, pos) that we want to filter by +:output: output file will contain only the lines in the given (chr, pos) positions +""" + +if __name__ == "__main__" : + args = docopt(__doc__) + if args["param"]: + param_description() + sys.exit() + + if args["example"]: + example_description() + sys.exit() + + pileup = args[''] + output = args[''] + listSites = args[''] + listPos = [] + with open(listSites,"r") as file1: + for item in file1: + iList=re.split( ",|\n" , item) + if len(iList)==1: + iList=item.split("\t") + listPos.append((iList[0],iList[1])) + filter_pileup_by_site_list(pileup,output,listPos) + diff --git a/Filtering/filter_primary_secondary.py b/Filtering/filter_primary_secondary.py new file mode 100644 index 0000000..656ab74 --- /dev/null +++ b/Filtering/filter_primary_secondary.py @@ -0,0 +1,63 @@ +"""Filter Primary Secondary + +this script returns 2 filenames that are with ending ".secondary.fastq" and ".primary.fastq" +the secondary one contains the lines with seqs that their sizes are between 19 and 23 +the primary one contains the lines with seqs that their sizes are between 25 and 30 + +Usage: + filter_primary_secondary.py param + filter_primary_secondary.py example + filter_primary_secondary.py + filter_primary_secondary.py -h | --help + +Options: + -h --help show this screen + +""" + + +from Filtering.fastq_seq_size_filtering import * +import os +from docopt import docopt + +def filterToPrimarySecondaryLibs(fastqFilename): + """ + :param fastqFilename: the fastq we want to create primary and secondary libs from + 
diff --git a/Filtering/filter_primary_secondary.py b/Filtering/filter_primary_secondary.py
new file mode 100644
index 0000000..656ab74
--- /dev/null
+++ b/Filtering/filter_primary_secondary.py
@@ -0,0 +1,63 @@
+"""Filter Primary Secondary
+
+This script writes two files whose names end with ".secondary.fastq" and ".primary.fastq".
+The secondary file contains the reads whose sequences are between 19 and 23 nt long;
+the primary file contains the reads whose sequences are between 25 and 30 nt long.
+
+Usage:
+    filter_primary_secondary.py param
+    filter_primary_secondary.py example
+    filter_primary_secondary.py <fastq_filename>
+    filter_primary_secondary.py -h | --help
+
+Options:
+    -h --help     show this screen
+
+"""
+
+
+from Filtering.fastq_seq_size_filtering import *
+import os
+from docopt import docopt
+
+
+def filterToPrimarySecondaryLibs(fastqFilename):
+    """
+    :param fastqFilename: the fastq we want to split into primary and secondary libraries
+    :return: the primary and secondary filenames, filtered by primary siRNA 25-30nt and secondary siRNA 19-23nt
+    """
+
+    open(fastqFilename, 'r').close()  # fail early if the input file does not exist
+
+    fastqName = fastqFilename.split(".")
+    fastqName.pop(len(fastqName) - 1)
+    mainName = ".".join(fastqName)
+    primary = mainName + ".primary.fastq"
+    secondary = mainName + ".secondary.fastq"
+    open(primary, "w").close()
+    open(secondary, "w").close()
+    non = "non_temp.txt"
+    open(non, "w").close()
+
+    filterFastqFileBySize(fastqFilename, 25, 30, primary, non)  # primaryRange = (25, 30)
+    filterFastqFileBySize(fastqFilename, 19, 23, secondary, non)  # secondaryRange = (19, 23)
+
+    os.remove(non)
+
+    return primary, secondary
+
+
+if __name__ == "__main__":
+    '''
+    input : a fastq file that we would like to split into 2 files by sequence size
+    output : two files, one ending with ".secondary.fastq" (sequences of 19-23 nt)
+    and one ending with ".primary.fastq" (sequences of 25-30 nt)
+    '''
+    import sys
+    arguments = docopt(__doc__)
+    fastq_filename = arguments['<fastq_filename>']
+    primary, secondary = filterToPrimarySecondaryLibs(fastq_filename)
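# --- Editor's sketch of the size filter this script relies on.
# filterFastqFileBySize lives in Filtering.fastq_seq_size_filtering, which is
# not part of this patch; the behaviour below is an assumption based on how it
# is called above (keep records whose sequence length is within [low, high]).
def filter_fastq_by_size_sketch(in_name, low, high, kept_name, rest_name):
    with open(in_name) as src, open(kept_name, "w") as kept, open(rest_name, "w") as rest:
        while True:
            record = [src.readline() for _ in range(4)]  # a fastq record is 4 lines
            if not record[0]:
                break
            out = kept if low <= len(record[1].strip()) <= high else rest
            out.writelines(record)
# ---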
diff --git a/Processing/sam_sorting.py b/Processing/sam_sorting.py
new file mode 100755
index 0000000..1cce4c3
--- /dev/null
+++ b/Processing/sam_sorting.py
@@ -0,0 +1,114 @@
+"""sam_sorting.
+
+Usage:
+    sam_sorting.py param
+    sam_sorting.py example
+    sam_sorting.py <sam_in_file> <sam_out_file>
+    sam_sorting.py (-h | --help)
+
+Options:
+    -h --help     Show this screen.
+
+"""
+from Utility.generators_utilities import key_sorted_gen
+from docopt import docopt
+import re
+
+
+# Exceptions
+
+class IllegalFile(Exception):
+    def __init__(self):
+        self.message = "File doesn't exist"
+
+    def __str__(self):
+        return self.message
+
+    def __repr__(self):
+        return self.message
+
+
+class IllegalFileWrapper(IllegalFile):
+    def __init__(self):
+        self.message = "File doesn't exist"
+
+    def __str__(self):
+        return self.message
+
+    def __repr__(self):
+        return self.message
+
+
+def sam_extract_range(line):
+    """
+    Key function used by the sorting algorithm; extracts the range_start and range_end
+    fields from the sam line.
+    :param line: the current line in the file
+    :return: (chromosome name, start range, end range)
+    """
+    split_line = re.split('\t|\n', line)
+    if line[0] == "@":  # header lines sort before everything else
+        return (str(float("-inf")), 0, 0)
+    elif split_line[2] == "*":  # the misaligned lines will be at the end of the output file
+        return ("~", 0, 0)
+    else:
+        return (split_line[2], int(split_line[3]), int(split_line[3]) + len(split_line[9]))
+
+
+def sam_sorted(sam_filename, out_filename):
+    """
+    :param sam_filename: the filename of the sam file
+    :return: writes an output file called out_filename, sorted primarily by chromosome name
+    and secondarily by interval range
+    """
+    # Sorting the file
+    with open(out_filename, "w") as output_file, \
+            open(sam_filename, "r") as file:
+        """
+        file_lines = file.readlines()
+        for line in file_lines:
+            if line[0] == "@":
+                output_file.write(line)
+            else:
+                break
+        for line in sorted(filter(lambda x: x[0] != "@", file_lines), key=sam_extract_range):
+            output_file.write(line)
+        """
+
+        for line in key_sorted_gen(sam_extract_range, file):
+            output_file.write(line)
+
+
+def parameters_description():
+    string = "The parameters are:" + "\n" \
+        + "sam_in_file: The name of the sam file that you would like to sort" + "\n" \
+        + "sam_out_file: The name of the output (the sorted sam file)"
+    print(string)
+
+
+def example_description():
+    input_string = "The input is:\n" \
+        + "NB500948:31:HJYCCBGXX:1:11101:26701:10507\t0\tchrI\t19959533\t255\t5M\t*\t0\t0\tCCGAC\tAAAAA\tXA:i:0\tMD:Z:123\tNM:i:0\n" \
+        + "NB500948:31:HJYCCBGXX:1:11101:11478:10492\t16\tchrI\t15070824\t255\t12M\t*\t0\t0\tAGCTCACGTTGA\tEEEEEEE/E/EE\tXA:i:0\tMD:Z:53G7C19A68\tNM:i:3\n" \
+        + "NB500948:31:HJYCCBGXX:1:11101:11478:10492\t16\tchrI\t15063627\t255\t10M\t*\t0\t0\tAGCTCACGTT\tEEEAAEAEEE\tXA:i:0\tMD:Z:53G7C19A68\tNM:i:3\n"
+
+    output_string = "The output is:\n" \
+        + "NB500948:31:HJYCCBGXX:1:11101:11478:10492\t16\tchrI\t15063627\t255\t10M\t*\t0\t0\tAGCTCACGTT\tEEEAAEAEEE\tXA:i:0\tMD:Z:53G7C19A68\tNM:i:3\n" \
+        + "NB500948:31:HJYCCBGXX:1:11101:11478:10492\t16\tchrI\t15070824\t255\t12M\t*\t0\t0\tAGCTCACGTTGA\tEEEEEEE/E/EE\tXA:i:0\tMD:Z:53G7C19A68\tNM:i:3\n" \
+        + "NB500948:31:HJYCCBGXX:1:11101:26701:10507\t0\tchrI\t19959533\t255\t5M\t*\t0\t0\tCCGAC\tAAAAA\tXA:i:0\tMD:Z:123\tNM:i:0\n"
+
+    print(input_string)
+    print(output_string)
+
+
+if __name__ == "__main__":
+    arguments = docopt(__doc__)
+
+    if arguments['param']:
+        parameters_description()
+
+    if arguments['example']:
+        example_description()
+
+    if arguments['<sam_in_file>'] and arguments['<sam_out_file>']:
+        sam_sorted(arguments['<sam_in_file>'], arguments['<sam_out_file>'])
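# --- Illustrative sketch (editor's example, not part of the patch): the sort
# key produced for one aligned line from the example above. Header lines sort
# first and unaligned ('*') lines sort last.
from Processing.sam_sorting import sam_extract_range

line = "NB500948:31:HJYCCBGXX:1:11101:26701:10507\t0\tchrI\t19959533\t255\t5M\t*\t0\t0\tCCGAC\tAAAAA\tXA:i:0\tMD:Z:123\tNM:i:0\n"
# key = (reference name, start, start + read length)
assert sam_extract_range(line) == ("chrI", 19959533, 19959538)
# ---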
diff --git a/README.md b/README.md
index 287dc56..1700944 100644
--- a/README.md
+++ b/README.md
@@ -14,15 +14,31 @@ RESIC implements a graph aligner module that enables modular composition of new
 
-### Installing and Running
+### Installing
 
 ```bash
 # download the repo
 git clone https://github.com/Lammlab/Resic
 cd Resic
 
-# add directories to PYTHONPATH
+# install python dependencies
+pip install -r requirements.txt
+
+# install samtools and bowtie
+
+# add modules to PYTHONPATH
 echo 'export PYTHONPATH="$PYTHONPATH:${HOME}/Resic"' >> ~/.bashrc
 ```
+
+### Running
+
+Edit the function `real_pipe` in the following file, then run it:
+
+```bash
+python Experiments/forontiers_jupyter/resic_run_file.py
+```
+
+Examples of the runs performed in the paper can be found in the `Experiments/forontiers_jupyter` directory, in files starting with `pipeline_test`.
+
+There is also a Jupyter notebook in that directory that walks through the different steps of the pipeline.
\ No newline at end of file
diff --git a/Utility/Annotated_Sequence_Class.py b/Utility/Annotated_Sequence_Class.py
new file mode 100755
index 0000000..b64005c
--- /dev/null
+++ b/Utility/Annotated_Sequence_Class.py
@@ -0,0 +1,63 @@
+from abc import ABC, abstractmethod
+from collections.abc import MutableMapping
+
+
+class SamLineTags(MutableMapping):
+    def __init__(self, *strings):
+        tags = [tag.split(':') for tag in strings[0]]
+        for tag, string in zip(tags, strings[0]):
+            if len(tag) != 3:
+                raise ValueError("%s is not a legal sam line tag" % string)
+        self.dict = {tag_key: (tag_type, tag_val) for (tag_key, tag_type, tag_val) in tags}
+
+    def __getitem__(self, item):
+        return self.dict[item][1]
+
+    def __setitem__(self, key, value):
+        tag_type = 'i' if type(value) is int else 'Z'
+        self.dict[key] = (tag_type, value)
+
+    def __delitem__(self, key):
+        del self.dict[key]
+
+    def __iter__(self):
+        for key in self.dict.keys():
+            yield key
+
+    def __len__(self):
+        return len(self.dict)
+
+    def __repr__(self):
+        str_tags = [':'.join([key, type, val]) for key, (type, val) in self.dict.items()]
+        return '\t'.join(str_tags)
+
+
+class Annotated_Sequence(ABC):
+
+    def __init__(self, tags):
+        self.tags = SamLineTags(tags)
+
+    @property
+    @abstractmethod
+    # (reference string, start, end) where start and end are relative to the 1st position in reference
+    def position(self): ...
+
+    @position.setter
+    @abstractmethod
+    def position(self, value): ...
+
+    def annotate(self, string):
+        # if we haven't set an annotation field before
+        if not ('annotate_count' in self.__dict__):
+            self.annotate_count = sum('annotate' in tag for tag in self.tags)
+
+        new_count = self.annotate_count + 1
+        self.tags['annotate_%s' % str(new_count)] = string
+
+        self.annotate_count = new_count
+        return
+
+    @abstractmethod
+    def flip_strand(self): ...
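# --- Illustrative sketch (editor's example, not part of the patch): SamLineTags
# behaves like a dict keyed by tag name; this variant takes a list of raw tags.
from Utility.Annotated_Sequence_Class import SamLineTags

tags = SamLineTags(["XA:i:0", "MD:Z:123"])
assert tags["MD"] == "123"   # values parse as strings
tags["NM"] = 0               # ints are stored with sam type 'i'
assert "NM" in tags and len(tags) == 3
# ---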
diff --git a/Utility/Fastq_class.py b/Utility/Fastq_class.py
new file mode 100755
index 0000000..7c822f5
--- /dev/null
+++ b/Utility/Fastq_class.py
@@ -0,0 +1,97 @@
+# Exceptions: Fastq class
+class MyException(Exception):
+    def __init__(self):
+        self.message = ''
+
+    def __repr__(self):
+        return self.message
+
+
+class InValidFastQFile(MyException):
+    def __init__(self):
+        self.message = ''
+
+
+class NumOfLinesNotDivisibleBy4(InValidFastQFile):
+    def set_message(self, num_of_lines):
+        self.message = 'FastQ file has ' + str(
+            num_of_lines) + ' lines, which is not divisible by 4 (the number of lines of each sequence)'
+        return self
+
+
+class InValidSequence(InValidFastQFile):
+    def set_message(self, approx_line):
+        self.message = 'Invalid Sequence of FastQ File: Approximate line = ' + str(approx_line)
+        return self
+
+
+class FirstLineNotStartWithAt(InValidSequence):
+    def set_message(self, line):
+        self.message += 'Line ' + str(line) + ' does not start with \'@\''
+        return self
+
+
+class ThirdLineNotStartWithPlus(InValidSequence):
+    def set_message(self, line):
+        self.message += 'Line ' + str(line) + ' does not start with \'+\''
+        return self
+
+
+class SecondAndForthLineNotSameSize(InValidSequence):
+    def set_message(self, line):
+        self.message += 'Lines ' + str(line + 1) + ',' + str(line + 3) + ' are not of the same length'
+        return self
+
+
+class WrongUsage_FastQClass(MyException):
+    def __init__(self):
+        self.message = 'Wrong Usage of FastQ Class:'
+
+
+class InValidCuttingIndices(WrongUsage_FastQClass):
+    def __init__(self):
+        self.message = 'Invalid Filtering Indices'
+
+
+class Fastq:
+    """
+    first string - identifier
+    second string - sequence
+    third string - identifier (may be different than the first line)
+    fourth string - confidence (of each nucleotide in the sequence)
+    """
+
+    def __init__(self, strings):
+        """
+        :param strings: The 4 lines (as a list of strings) which compose each fastq sequence
+        """
+        if strings[0][0] != "@":
+            raise FirstLineNotStartWithAt
+        if strings[2][0] != "+":
+            raise ThirdLineNotStartWithPlus
+        if len(strings[1]) != len(strings[3]):
+            raise SecondAndForthLineNotSameSize
+        strings = [item.strip("\n") for item in strings]
+        self.strings = strings
+
+    def __repr__(self):
+        return "\n".join(self.strings) + "\n"
+        # TODO: removed newlines between lines, remember to fix
+
+    def cut_seq(self, start_index, end_index):
+        """
+        :return: a FastQ object in which the sequence and the quality score are cut according to the indices
+        """
+        if not isinstance(start_index, int) or not isinstance(end_index, int) or end_index <= start_index:
+            raise InValidCuttingIndices
+        temp = Fastq([self.strings[0],
+                      self.strings[1][start_index:end_index],
+                      self.strings[2],
+                      self.strings[3][start_index:end_index]])
+        return temp
+
+    def get_seq(self):
+        return self.strings[1]
+
+    def get_id(self):
+        # note: returns the whole header line, including the leading '@'
+        return self.strings[0]
+
+    def __getitem__(self, slice):
+        """
+        :param slice: A slice [start:end] of 2 integers specifying a range in the sequence
+        :return: a FastQ sequence cut in this manner: 1st,3rd lines remain the same; 2nd,4th lines are cut according to [start:end]
+        """
+        return self.cut_seq(slice.start, slice.stop)
+
+    def __len__(self):
+        """
+        :return: Length of the nucleotide sequence
+        """
+        return len(self.strings[1])
\ No newline at end of file
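# --- Illustrative sketch (editor's example, not part of the patch): building
# and slicing a Fastq record. Slicing cuts the sequence and quality strings
# while keeping both header lines intact.
from Utility.Fastq_class import Fastq

rec = Fastq(["@read1\n", "ACGTACGT\n", "+\n", "IIIIIIII\n"])
assert len(rec) == 8
assert rec[2:6].get_seq() == "GTAC"
# ---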
diff --git a/Utility/Samfile_class.py b/Utility/Samfile_class.py
new file mode 100755
index 0000000..e4871c2
--- /dev/null
+++ b/Utility/Samfile_class.py
@@ -0,0 +1,180 @@
+import re
+import tempfile
+from collections.abc import MutableMapping
+from Utility.generators_utilities import getLineFromChunk
+from Utility.temp_files import *
+
+
+class SamLineTags(MutableMapping):
+    def __init__(self, *strings):
+        tags = [tag.split(':') for tag in strings]
+        for tag, string in zip(tags, strings):
+            if len(tag) != 3:
+                raise ValueError("%s is not a legal sam line tag" % string)
+        self.dict = {tag_key: (tag_type, tag_val) for (tag_key, tag_type, tag_val) in tags}
+
+    def __getitem__(self, item):
+        return self.dict[item][1]
+
+    def __setitem__(self, key, value):
+        tag_type = 'i' if type(value) is int else 'Z'
+        self.dict[key] = (tag_type, value)
+
+    def __delitem__(self, key):
+        del self.dict[key]
+
+    def __iter__(self):
+        for key in self.dict.keys():
+            yield key
+
+    def __len__(self):
+        return len(self.dict)
+
+    def __repr__(self):
+        str_tags = [':'.join([key, type, val]) for key, (type, val) in self.dict.items()]
+        return '\t'.join(str_tags)
+
+
+class SamLine:
+    # TODO - document this class - need to write how we can get the class fields
+    fieldNames = {
+        "seq_id": 0,
+        "flags": 1,
+        "reference_id": 2,
+        "_gene_pos": 3,
+        "mapq": 4,
+        "cigar": 5,
+        "rnext": 6,
+        "pnext": 7,
+        "tlen": 8,
+        "sequence": 9,
+        "quality": 10,
+    }
+
+    def __init__(self, line):
+        line = line.strip('\n')
+        fields = re.split('\t|\n', line)
+        # dictionary = {key: fields[index] for (key, index) in self.fieldNames.items()}
+        # self.__dict__ = dictionary
+
+        self.seq_id = fields[0]
+        self._flags = fields[1]  # int
+        self.reference_id = fields[2]
+        self._gene_pos = fields[3]  # int
+        self._mapq = fields[4]  # int
+        self.cigar = fields[5]
+        self.rnext = fields[6]
+        self._pnext = fields[7]  # int
+        self._tlen = fields[8]  # int
+        self.sequence = fields[9]
+        self.quality = fields[10]
+
+        # create tags that act like a dict
+        raw_tags = fields[11:]
+        self.tags = SamLineTags(*raw_tags)
+
+    def __repr__(self):
+        # fields = [self.__dict__[i] for i in self.fieldNames.keys()]
+        fields = [self.seq_id, self._flags, self.reference_id, self._gene_pos, self._mapq, self.cigar, self.rnext, self._pnext, self._tlen, self.sequence, self.quality]
+        fields.append(str(self.tags))
+        return '\t'.join(fields)
+
+    def is_alligned(self):
+        # true if the reference is not *
+        return self.reference_id != '*'
+
+    def strand(self):
+        """ returns the sense of the sequence """
+        # logic: if bit 5 (16) is on, this will become -1
+        # else it will be 1
+        if int(self.flags) & 16 == 16:
+            return -1
+        else:
+            return 1
+
+    def annotate(self, string):
+        # if we haven't set an annotation field before
+        if not ('annotate_count' in self.__dict__):
+            self.annotate_count = sum('annotate' in tag for tag in self.tags)
+
+        new_count = self.annotate_count + 1
+        self.tags['annotate_%s' % str(new_count)] = string
+
+        self.annotate_count = new_count
+        return
+
+    # use the position method to get reference_id, sequence start position, and sequence end position
+    def position(self):
+        return self.reference_id, int(self._gene_pos), int(self._gene_pos) + len(self.sequence)
+
+    def sequence_id(self):
+        return self.seq_id
+
+    @classmethod
+    def is_header(cls, line):
+        return line[0] == '@'
+
+    # TODO move this to Annotated_sequence
+    @property
+    def gene_pos(self):
+        return int(self._gene_pos)
+
+    @gene_pos.setter
+    def gene_pos(self, pos):
+        self._gene_pos = str(pos)
+
+    @property
+    def pnext(self):
+        return int(self._pnext)
+
+    @pnext.setter
+    def pnext(self, next_pos):
+        self._pnext = str(next_pos)
+
+    @property
+    def flags(self):
+        return int(self._flags)
+
+    @flags.setter
+    def flags(self, flag):
+        self._flags = str(flag)
+
+    @property
+    def mapq(self):
+        return int(self._mapq)
+
+    @mapq.setter
+    def mapq(self, quality):
+        self._mapq = str(quality)
+
+    @property
+    def tlen(self):
+        return int(self._tlen)
+
+    @tlen.setter
+    def tlen(self, length):
+        self._tlen = str(length)
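# --- Illustrative sketch (editor's example, not part of the patch): round-tripping
# one of the sam_sorting example lines through SamLine. position() spans
# [start, start + len(sequence)).
from Utility.Samfile_class import SamLine

line = "NB500948:31:HJYCCBGXX:1:11101:26701:10507\t0\tchrI\t19959533\t255\t5M\t*\t0\t0\tCCGAC\tAAAAA\tXA:i:0\tMD:Z:123\tNM:i:0"
sl = SamLine(line)
assert sl.position() == ("chrI", 19959533, 19959538)
assert sl.strand() == 1        # flag 0: forward strand
assert sl.tags["NM"] == "0"    # tag values parse as strings
# ---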
def split_sam(sam_filename):
+    """
+    :param sam_filename: filename of a sam file
+    :return: the names of 3 output files: 1. header lines file
+                                          2. aligned lines file
+                                          3. misaligned lines file
+    """
+    header_file = open(new_temp_file(), "w+")  # tempfile.NamedTemporaryFile(prefix="sam_splitting", dir="/tmp", mode='w+', delete=False)
+    aligned_file = open(new_temp_file(), "w+")  # tempfile.NamedTemporaryFile(prefix="sam_splitting", dir="/tmp", mode='w+', delete=False)
+    misaligned_file = open(new_temp_file(), "w+")  # tempfile.NamedTemporaryFile(prefix="sam_splitting", dir="/tmp", mode='w+', delete=False)
+
+    with open(sam_filename, "r") as file:
+        gen = getLineFromChunk(file)
+        for line in gen:
+            if line[0] == "@":  # header sam lines
+                header_file.write(line)
+            elif re.split('\t|\n', line)[2] == "*":  # if the chr_name field is *, then the line is misaligned
+                misaligned_file.write(line)
+            else:
+                aligned_file.write(line)
+
+    header_file.close()
+    aligned_file.close()
+    misaligned_file.close()
+
+    return (header_file.name, aligned_file.name, misaligned_file.name)
diff --git a/Utility/count_utils.py b/Utility/count_utils.py
new file mode 100644
index 0000000..4c9e14b
--- /dev/null
+++ b/Utility/count_utils.py
@@ -0,0 +1,30 @@
+import pandas as pd
+
+
+def column_delta_df(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    returns a DataFrame where every column j except the first takes the value of col[j]-col[j-1]
+    :param df: a pandas df containing only numeric values
+    :return: a pandas df
+    """
+    delta = pd.DataFrame.copy(df, deep=True)
+
+    delta.iloc[:, 1:] = df.values[:, 1:] - df.values[:, :-1]
+    return delta
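# --- Illustrative sketch (editor's example, not part of the patch): column_delta_df
# keeps the first column and replaces every later column j with col[j] - col[j-1].
import pandas as pd
from Utility.count_utils import column_delta_df

df = pd.DataFrame({"a": [1, 10], "b": [3, 12], "c": [6, 15]})
delta = column_delta_df(df)
assert delta["a"].tolist() == [1, 10]
assert delta["b"].tolist() == [2, 2]
assert delta["c"].tolist() == [3, 3]
# ---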
+def get_line_count(filename):
+    """
+    count the number of lines in a file.
+    taken from
+    https://stackoverflow.com/a/27518377
+    :param filename: file name
+    :return: number of lines in the file
+    """
+    def _make_gen(reader):
+        b = reader(1024 * 1024)
+        while b:
+            yield b
+            b = reader(1024 * 1024)
+
+    with open(filename, 'rb') as f:
+        f_gen = _make_gen(f.raw.read)
+        return sum(buf.count(b'\n') for buf in f_gen)
diff --git a/Utility/strip_last_newline.py b/Utility/strip_last_newline.py
new file mode 100755
index 0000000..320ec94
--- /dev/null
+++ b/Utility/strip_last_newline.py
@@ -0,0 +1,26 @@
+import os
+
+
+def strip_last_NL(in_file):
+    """truncate the trailing newline characters, if any, from the end of in_file in place"""
+    if os.path.getsize(in_file) <= 2:
+        return  # file too small
+    with open(in_file, "a+b") as fp:
+        # get the last 2 bytes
+        fp.seek(-2, 2)
+        data = fp.read(2)
+        last_chars = [chr(b) for b in data]
+
+        # 2 newline chars at the end
+        if set(last_chars) == {'\n', '\r'}:
+            truncate = 2
+        # 1 newline char at the end
+        elif last_chars[1] == '\n' or last_chars[1] == '\r':
+            truncate = 1
+        # no newline chars at the end
+        else:
+            truncate = 0
+        # note: we are at the end of the file here, so tell() gives us the size of the file
+        fp.truncate(fp.tell() - truncate)
diff --git a/Utility/temp_files.py b/Utility/temp_files.py
new file mode 100755
index 0000000..430f66f
--- /dev/null
+++ b/Utility/temp_files.py
@@ -0,0 +1,21 @@
+import tempfile
+import os
+import socket
+
+socket_debug = False
+
+
+def new_temp_file(dir='/tmp/', *args, **kwargs):
+    # TODO fix to check tmp dir with os
+    if (dir == '/tmp/' or dir == '/tmp') and socket.gethostname() == 'akalton':
+        dir = '/Bigdata/tmp/'
+
+    if socket_debug:
+        with open("/tmp/socket_log.log", 'a') as logfp:
+            logfp.write("socket is %s, dir is %s \n" % (socket.gethostname(), dir))
+
+    if not os.path.isdir(dir):
+        os.makedirs(dir)
+    os_handle, name = tempfile.mkstemp(dir=dir, *args, **kwargs)
+    os.close(os_handle)
+    return name
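# --- Illustrative sketch (editor's example, not part of the patch): new_temp_file
# returns a path to an already-created (and closed) empty file, so callers can
# reopen it in any mode; extra keyword arguments go to tempfile.mkstemp.
import os
from Utility.temp_files import new_temp_file

path = new_temp_file(suffix=".sam")
with open(path, "w") as fp:
    fp.write("@HD\tVN:1.0\n")
os.remove(path)  # the caller owns cleanup
# ---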
diff --git a/Utility/threading_tree_utils.py b/Utility/threading_tree_utils.py
new file mode 100755
index 0000000..a32f71b
--- /dev/null
+++ b/Utility/threading_tree_utils.py
@@ -0,0 +1,161 @@
+from contextlib import contextmanager
+import multiprocessing
+
+from typing import Dict
+from typing import List
+from collections import namedtuple
+import copy
+
+Fathers_list = List[str]
+Specifications = namedtuple('Specifications', ['id', 'flags', 'pre_function', 'post_function'])
+Dict_specifications = Dict[Specifications, Fathers_list]
+
+
+class ThreadingTree:
+    """ this class handles the waiting of the child vertices for their fathers,
+    handles the dict of filenames that holds the "output" file of each step for its children,
+    and in general handles the tree structure given by the Dict_specifications """
+
+    def __init__(self, dict_spec: Dict_specifications):
+        self.dict_tree = copy.deepcopy(dict_spec)
+
+        if not self.is_not_circular_dependency(dict_spec):
+            raise ValueError("The dictionary of specifications must have a step without prior dependencies")
+
+        # filling the dict of events for each vertex of the "tree"
+        self.events_by_id_dict = {spec.id: multiprocessing.Event() for spec in dict_spec.keys()}
+        self.filenames_dict = multiprocessing.Manager().dict({spec.id: None for spec in dict_spec.keys()})
+        self.filenames_dict_lock = multiprocessing.Lock()
+        self.fasta_indexing_lock = multiprocessing.Lock()
+        self.secondary_fasta_indexing_lock = multiprocessing.Lock()
+
+    def is_not_circular_dependency(self, dict_spec: Dict_specifications):
+        """
+        :param dict_spec: the specification dict mapping each step to the list of its fathers
+        :return: True if there is a step without prior dependencies to start the threading from
+        """
+        hist_dict = dict()
+        for spec in dict_spec.keys():
+            id = spec.id
+            if id not in hist_dict:
+                hist_dict[id] = 1
+            else:
+                hist_dict[id] = hist_dict[id] + 1
+        for id, number in hist_dict.items():
+            if number == 1:
+                return True
+
+        return False
+
+    @contextmanager
+    def threading_context(self, id_step):
+        """
+        this function is used in a with statement: "with ThreadingTree.threading_context(id): ...".
+        On entering the with block, the part before the yield is executed, then the body of the
+        with block runs, and when it is done the part in the finally clause is executed.
+
+        this function handles the waiting of a child vertex for its fathers.
+        :param id_step: the id of the vertex in the tree
+        """
+        # the enter part - wait on the events of all the fathers
+        for spec, fathers in self.dict_tree.items():
+            if spec.id == id_step:
+                for father in fathers:
+                    self.events_by_id_dict[father].wait()  # waiting on all fathers
+
+        try:
+            yield
+        finally:  # here is exit
+            self.events_by_id_dict[id_step].set()  # setting the curr event - because the action is done
+
+    @contextmanager
+    def locking_context(self):
+        """
+        locking so that no two threads try to do a bowtie build at the same time and collapse
+        """
+        self.fasta_indexing_lock.acquire(True)
+        try:
+            yield
+        finally:  # here is exit
+            self.fasta_indexing_lock.release()
+
+    @contextmanager
+    def secondary_locking_context(self):
+        """
+        locking so that no two threads try to do a bowtie build at the same time and collapse
+        """
+        self.secondary_fasta_indexing_lock.acquire(True)
+        try:
+            yield
+        finally:  # here is exit
+            self.secondary_fasta_indexing_lock.release()
+
+    def set_filename_of_finished_alignment(self, id_step, name_of_file):
+        """
+        sets the filename for the current step, for the use of its child vertices
+        :param name_of_file: the name of the output file from the id_step vertex
+        """
+        self.filenames_dict_lock.acquire(True)
+
+        self.filenames_dict[id_step] = name_of_file
+
+        self.filenames_dict_lock.release()
+
+    def get_fathers_and_spec(self, step_id):
+        """
+        :param step_id: the id of the current step
+        :return: the fathers list and the spec (Specifications) of the current step_id, as they
+        appeared in the dict_spec that was originally given to the class
+        """
+        for spec, fathers in self.dict_tree.items():
+            if spec.id == step_id:
+                return copy.deepcopy(fathers), copy.deepcopy(spec)
+        return [], None
+
+    def get_previous_steps_file_names(self, id_step):
+        filenames = []
+        fathers, spec = self.get_fathers_and_spec(id_step)
+        for father in fathers:
+            filenames.append(copy.deepcopy(self.filenames_dict[father]))
+        return filenames
+
+    def get_step_filename(self, id_step):
+        return copy.deepcopy(self.filenames_dict[id_step])
+
+    def get_id_of_leafs(self):
+        """
+        :return: a list of the ids of the leaf vertices of the graph
+        """
+        hist_dict = dict()
+        for spec, fathers in self.dict_tree.items():
+            for father in fathers:
+                if father not in hist_dict:
+                    hist_dict[father] = 1
+                else:
+                    hist_dict[father] = hist_dict[father] + 1
+        leafs_list = []
+        for spec, fathers in self.dict_tree.items():
+            if spec.id not in hist_dict.keys():
+                leafs_list.append(copy.deepcopy(spec.id))
+
+        return leafs_list
+
+    def get_filenames_of_leafs(self):
+        """
+        :return: the filenames that the leaf vertices of the tree set as their filename results
+        """
+        res = []
+        for id1 in self.get_id_of_leafs():
+            res.append(copy.deepcopy(self.filenames_dict[id1]))
+        return res
diff --git a/__init__.py b/__init__.py
new file mode 100755
index 0000000..e69de29
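# --- Illustrative sketch (editor's example, not part of the patch): building the
# specification dict that ThreadingTree expects. The ids, flags and functions are
# invented for the example; each entry maps a step to the list of its father steps.
from Utility.threading_tree_utils import Specifications, ThreadingTree

spec = {
    Specifications(id='root', flags='', pre_function=None, post_function=None): [],
    Specifications(id='child', flags='', pre_function=None, post_function=None): ['root'],
}
tree = ThreadingTree(spec)     # raises ValueError on circular dependencies
print(tree.get_id_of_leafs())  # ['child'] - the steps nobody depends on
# ---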