forked from Lammlab/Resic
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
16 changed files
with
1,102 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
"""filter_pileup_by_existing_snp.py | ||
The script receives pileup file, cns file(/pileup file) and an output filename. | ||
it filter out the lines from the pileup file that overlapped with the cns file if there | ||
was a change in the read string (one at least of atcgATCG and the other IUPAC nucleotide code in the reads string ) of the cns file. | ||
by defult it keeps the lines that didnt overlap, if flag --rm-non-overlapping is added | ||
it filter those lines out. | ||
Usage: | ||
snprationotincs.py param | ||
snprationotincs.py example | ||
snprationotincs.py <pileup_input> <cns_file> <output_file> [<output_file_rest>] [--rm-non-overlapping] | ||
snprationotincs.py -h | --help | ||
Options: | ||
-h --help Show this screen. | ||
-r --rm-non-overlapping Remove the lines that from pileup_input that didnt match no line in cns_file | ||
""" | ||
|
||
|
||
# the cns_file arg can be a pileup file or a Bsortedcns file # | ||
|
||
from Utility.parallel_generator import * | ||
from Utility.Pileup_class import Pileup_line | ||
import itertools | ||
from Utility.Bsotredcns_class import Bsortedcns_line | ||
from Utility.generators_utilities import * | ||
from docopt import docopt | ||
import os | ||
|
||
|
||
'''def get_pos_and_id(line): | ||
""" | ||
@param line : a pileup line object | ||
return : ref_base , gene_pos of the pileup line | ||
""" | ||
return line.reference_id,line.gene_pos''' | ||
|
||
|
||
'''def is_with_any_change(pileup_line): | ||
""" | ||
@param pileup_line: a pileup line (object) | ||
return : true if in the reads string there is one or more of a c g t A C G T , else false | ||
""" | ||
#all the possible IUPAC nucleotide code | ||
nuc_changes = ["a","c","g","t","r","y","s","w","k","m","b","d","h","v","n"] | ||
all_nuc_changes = [letter.upper() for letter in nuc_changes] + nuc_changes | ||
if any(change in pileup_line.reads() for change in all_nuc_changes): | ||
return True | ||
else : | ||
return False''' | ||
|
||
|
||
def filter_pileup_by_ref_file_comparing(pileup_input, ref_file, output_file,rest_output,ref_type=Pileup_line, remove_non_matched = False): | ||
""" | ||
@param pileup_input: pileup file name | ||
@param ref_file: pileup/Bsortedcns file name | ||
@param output_file: output file name with path | ||
@param rest_output: the name of the filterout file | ||
@param ref_type: "pileup" or "Bsortedcns" file | ||
@param remove_non_matched: if remove_non_matched = true it filter lines that didnt overlap out, else keeps them. | ||
action : receives pileup file, cns file(/pileup file) and an output file. | ||
it filter out the lines from the pileup file that overlapped with the cns file if there | ||
was a change in the read string (one at least of atcgATCG in the reads string ). | ||
by defult it keeps the lines that didnt overlap, if remove_non_matched = true | ||
it filter those lines out. | ||
the filter- out lines are printed to a file named output_file (without ending ) + "filter_out."+ (the ending) | ||
""" | ||
|
||
|
||
#change_functor = is_with_any_change | ||
|
||
with open(pileup_input, "r") as in_file, open(ref_file,"r") as ref, open(output_file, "w") as out, open(rest_output,"w") as out2 : | ||
|
||
input_gen = class_generator(Pileup_line,file=in_file) | ||
ref_gen = class_generator(ref_type,file=ref) | ||
|
||
get_pos_and_id = lambda x : (x.reference_id,x.gene_pos) | ||
|
||
parallel_gen = parallel_generator([input_gen,ref_gen],[get_pos_and_id,get_pos_and_id]) | ||
|
||
for [list_from_in_file, list_from_ref_file] in parallel_gen: | ||
# when there is a line that matched the ref - check the changes, | ||
# if not changed then print the line to output | ||
if (list_from_in_file is not None) and (list_from_ref_file is not None): # if there is overlap | ||
if not(list_from_ref_file[0].is_with_any_change()): # when there is no change in the reads of the ref file (all "," or ".") | ||
out.write(str(list_from_in_file[0]) + "\n") | ||
else : | ||
out2.write(str(list_from_in_file[0]) + "\n") | ||
elif (list_from_in_file is not None) and not(remove_non_matched): # when not matched, just print (only if requested to print the non matched) | ||
out.write(str(list_from_in_file[0]) + "\n") | ||
elif (list_from_in_file is not None): # the removed files are printed to the filter_out file | ||
out2.write(str(list_from_in_file[0]) + "\n") | ||
|
||
|
||
def print_params(): | ||
pass | ||
|
||
|
||
def print_example(): | ||
pass | ||
|
||
|
||
if __name__ == '__main__': | ||
import sys | ||
args = docopt(__doc__) | ||
if args["param"]: | ||
print_params() | ||
sys.exit() | ||
|
||
if args["example"]: | ||
print_example() | ||
sys.exit() | ||
|
||
# Extracting the parameters | ||
pileup_input = args['<pileup_input>'] | ||
ref_file = args['<cns_file>'] | ||
output_file = args['<output_file>'] | ||
remove = args['--rm-non-overlapping'] | ||
rest = args['<output_file_rest>'] | ||
|
||
if ref_file.split(".")[-1] == "Bsortedcns": | ||
ref_type1 = Bsortedcns_line | ||
else: | ||
ref_type1 = Pileup_line | ||
if not(rest): | ||
filename, file_extension = os.path.splitext(output_file) | ||
rest_output = filename + ".filter_out" + file_extension | ||
else: rest_output = rest | ||
|
||
filter_pileup_by_ref_file_comparing(pileup_input,ref_file,output_file,rest_output,ref_type=ref_type1,remove_non_matched=remove) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#TODO refactor | ||
|
||
class IllegalArgument(Exception): | ||
def __init__(self): | ||
self.message = "you should enter name of pileup file, output file and at least one Nucleotide for ref base" | ||
|
||
def __str__(self): | ||
return self.message | ||
|
||
def __repr__(self): | ||
return self.message | ||
|
||
|
||
def filter_pileup_by_reference_base_set(pileup,output,listBases): | ||
""" | ||
:param pileup: the pile up we want to filter | ||
:param listBases: a list of ref bases we want to filter by | ||
:param output: the output file that will contain lines with only the bases in listBases as ref base | ||
""" | ||
with open(pileup,"r") as file1: | ||
line = file1.readline() | ||
with open(output,"w") as fileout: | ||
while line is not None and line != "": | ||
lineParts = line.split() | ||
if lineParts[2] in listBases: | ||
fileout.write(line) | ||
line=file1.readline() | ||
|
||
|
||
|
||
if __name__ == "__main__" : | ||
|
||
""" | ||
input : python filter_pileup_by_reference_base_set.py pileup_filename output_filename ref_bases_list | ||
example : python filter_pileup_by_reference_base_set.py exmple.pileup output.pileup A G C | ||
pileup_filename - the file we want to filter from by ref bases - to keep the lines with those ref bases | ||
output_filename - the output file for the result filter | ||
ref_bases_list - a list of ref bases we want to keep in the output file | ||
output : the lines from the pileup_filename file that contain one of the ref_bases_list as ref base. | ||
""" | ||
import sys | ||
if len(sys.argv)<3 : raise IllegalArgument | ||
pileup = sys.argv[1] | ||
output = sys.argv[2] | ||
|
||
numberOfBases = len(sys.argv) -3 | ||
if numberOfBases == 0: | ||
raise IllegalArgument | ||
if numberOfBases > 0: | ||
one = sys.argv[3] | ||
if numberOfBases > 1: | ||
two = sys.argv[4] | ||
if numberOfBases > 2: | ||
three = sys.argv[5] | ||
if numberOfBases > 3: | ||
four = sys.argv[6] | ||
if numberOfBases > 4: | ||
five = sys.argv[7] | ||
filter_pileup_by_reference_base_set(pileup,output,[one,two,three,four,five]) | ||
else: | ||
filter_pileup_by_reference_base_set(pileup, output,[one, two, three, four]) | ||
else: | ||
filter_pileup_by_reference_base_set(pileup,output, [one, two, three]) | ||
else: | ||
filter_pileup_by_reference_base_set(pileup,output, [one, two]) | ||
else: | ||
filter_pileup_by_reference_base_set(pileup,output, [one]) | ||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
"""Filter pileup by site list | ||
Usage: | ||
Filter_pileup_by_site_list param | ||
Filter_pileup_by_site_list example | ||
Filter_pileup_by_site_list <pileup_file> <out_file> <site_list> | ||
Filter_pileup_by_site_list -h | --help | ||
Options: | ||
-h --help Show this screen | ||
""" | ||
|
||
|
||
|
||
import sys | ||
from docopt import docopt | ||
import re | ||
|
||
|
||
|
||
def filter_pileup_by_site_list(pileup,output,listSites): | ||
""" | ||
:param pileup: the pile up we want to filter | ||
:param listSites: a list of sites - (chr , pos) - we want to filter by | ||
:param output: the output file that will contain lines with only the sites in listSites as ref base | ||
""" | ||
with open(pileup,"r") as file1: | ||
line = file1.readline() | ||
with open(output,"w") as fileout: | ||
while line is not None and line != "": | ||
lineParts = line.split() | ||
if (lineParts[0],lineParts[1]) in listSites: | ||
fileout.write(line) | ||
line=file1.readline() | ||
|
||
|
||
|
||
def param_description(): | ||
""" | ||
Description for the script | ||
""" | ||
print("The parameters are\n" + | ||
"pileup_file: the pileup file you want to filter\n" + | ||
"out_file: the name of the output file\n" + | ||
"site_list: the name of the file where each line is (char, pos) that we want to filter by\n") | ||
|
||
|
||
def example_description(): | ||
""" | ||
Usage example for the script | ||
""" | ||
print("Pileup input:\n" + | ||
"seq1\t272\tA\t24\t,.$.....,,.,.G...G,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t\n" + | ||
"seq1\t273\tC\t23\t,.....,,.,A,..,.,AAA\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t274\tT\t24\t,.$.....,,.,.,,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t275\tT\t23\t,.....,,AAA,GG..,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t276\tT\t24\t,.$.....CCC,.,,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t277\tT\t23\t,.....,,***.,.,..,.,..A\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t278\tT\t23\t,.....,,.,.AA.,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t279\tT\t23\t,...AA,,A,A,....,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t280\tG\t24\t,.$..A.AA,.AAA,,A.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t281\tT\t23\t,.....,,.,.,,,AA,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t282\tT\t24\t,.$.....,,.,..,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t283\tT\t23\t,.....,AAA.,CCCC,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t284\tT\t23\t,.....,,.,.,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t285\tG\t23\tAAAAAACCCCC,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t286\tT\t24\t,.$.....+3ABC,,.,A,,,,AA.....\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t287\tT\t23\t,.....,-2AA,.,.,..,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t288\tT\t24\t,.$AAAAAAAAA.AAAAAAA^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t289\tG\t23\tTTTTTTTTTT..TTTTTTTTT\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t290\tC\t23\tGG$GGGGGGGGGGGGGGGGG\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq2\t291\tC\t23\t...$GGGGGG....A......\t<<<<<<<<<<<<<<<<<<<<\t" + | ||
"Sites list file:\n" + | ||
"seq1, 273\n seq1, 274\n seq1, 282\n seq1, 283\n seq1, 284\n seq1, 288\n seq1, 290\n seq2, 291\n" + | ||
"Calling the script:\n" + | ||
"python filter_pileup_by_site_list pileup_file out_file site_list\n" + | ||
"And the output:\n" + | ||
"seq1\t273\tC\t23\t,.....,,.,A,..,.,AAA\t<<<<<<<<<<<<<<<<<<<<\t" +"\n"+ | ||
"seq1\t274\tT\t24\t,.$.....,,.,.,,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t282\tT\t24\t,.$.....,,.,..,,,.,..^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t283\tT\t23\t,.....,AAA.,CCCC,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t284\tT\t23\t,.....,,.,.,,,,.,..A\t<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t288\tT\t24\t,.$AAAAAAAAA.AAAAAAA^+.\t<<<<<<<<<<<<<<<<<<<<<\t" + "\n" + | ||
"seq1\t290\tC\t23\tGG$GGGGGGGGGGGGGGGGG\t<<<<<<<<<<<<<<<<<<<<\t"+ "\n" + | ||
"seq2\t291\tC\t23\t...$GGGGGG....A......\t<<<<<<<<<<<<<<<<<<<<\t") | ||
|
||
|
||
|
||
""" | ||
:param pileup: the pile up we want to filter | ||
:param output: the output file that will contain lines with only the sites in listSites as ref base | ||
:param listSites: file where each line is (char, pos) that we want to filter by | ||
:output: output file will contain only the lines in the given (chr, pos) positions | ||
""" | ||
|
||
if __name__ == "__main__" : | ||
args = docopt(__doc__) | ||
if args["param"]: | ||
param_description() | ||
sys.exit() | ||
|
||
if args["example"]: | ||
example_description() | ||
sys.exit() | ||
|
||
pileup = args['<pileup_file>'] | ||
output = args['<out_file>'] | ||
listSites = args['<site_list>'] | ||
listPos = [] | ||
with open(listSites,"r") as file1: | ||
for item in file1: | ||
iList=re.split( ",|\n" , item) | ||
if len(iList)==1: | ||
iList=item.split("\t") | ||
listPos.append((iList[0],iList[1])) | ||
filter_pileup_by_site_list(pileup,output,listPos) | ||
|
Oops, something went wrong.