-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_m8.py
85 lines (69 loc) · 3.89 KB
/
parse_m8.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python3.10
# Author: Giorgia Del Missier
import argparse
import json
# Argument parser setup
parser = argparse.ArgumentParser(description="Parse Foldseek results and normalize bitscores.")
parser.add_argument('--input', required=True, help='Input: Foldseek results (.m8 file path)')
parser.add_argument('--input_bh', required=True, help='Input: Foldseek reciprocal search results (.m8 file path)')
parser.add_argument('--input_normalisation', required=True, help='Input: Foldseek results for bitscore normalization (.m8 file path)')
parser.add_argument('--input_normalisation_bh', required=True, help='Input: Foldseek best reciprocals results for bitscore normalization (.m8 file path)')
parser.add_argument('--proteome_size', required=True, type=int, help="Proteome size (i.e., number of AlphaFold models) of selected taxid")
parser.add_argument('--eval_thr', required=True, type=float, help='E-value threshold')
parser.add_argument('--bits_thr', required=True, type=int, help='Bitscore threshold')
args = parser.parse_args()
def parse_fs(fin, fnormalisation, eval_thr, bits_thr):
"""
Parse Foldseek output files and normalize bitscores.
Parameters:
- fin: Path to the input file with Foldseek results
- fnormalisation: Path to the input file with Foldseek results for normalization
- eval_thr: E-value threshold
- bits_thr: Bitscore threshold
Returns:
- Dictionary of queries and their significant hits
"""
queries = dict()
normalisation = dict()
# Read normalization file
with open(fnormalisation) as fnorm:
for line in fnorm:
line = line.strip().split()
query, target, evalue, bitscore = line[0], line[1], float(line[-2]), int(line[-1])
# Extract query and target IDs from headers
query = query.split("-")[1] if "-" in query else query
query = query.split(".gz")[0][:-4] if ".gz" in query else query
target = target.split("-")[1] if "-" in target else target
target = target.split(".gz")[0][:-4] if ".gz" in target else target
if query == target:
if query not in normalisation:
normalisation[query] = bitscore
# Read input file
with open(fin) as fs_in:
for line in fs_in:
line = line.strip().split()
query, target, evalue, bitscore = line[0], line[1], float(line[-2]), int(line[-1])
# Extract query and target IDs from headers
query = query.split("-")[1] if "-" in query else query
query = query.split(".gz")[0][:-4] if ".gz" in query else query
target = target.split("-")[1] if "-" in target else target
target = target.split(".gz")[0][:-4] if ".gz" in target else target
if query != target:
if bitscore > bits_thr and evalue < eval_thr:
bitscore = round(bitscore / normalisation.get(query, 1), 3)
try:
if (target, evalue, bitscore) not in queries[query]:
queries[query].append((target, evalue, bitscore))
except KeyError:
queries[query] = [(target, evalue, bitscore)]
return queries
if __name__ == "__main__":
# Parsing the input files
all_queries = parse_fs(args.input, args.input_normalisation, args.eval_thr, args.bits_thr)
reciprocal_queries = parse_fs(args.input_bh, args.input_normalisation_bh, args.eval_thr, args.bits_thr)
print(f"Found significant hits for {len(all_queries)} out of {args.proteome_size} proteins in the target organism")
# Saving the results to JSON files
with open(f"{args.input[:-3]}.json", "w") as fall, open(f"{args.input_bh[:-3]}.json", "w") as freci:
json.dump(all_queries, fall)
json.dump(reciprocal_queries, freci)
print(f"Results have been written to {args.input[:-3]}.json and {args.input_bh[:-3]}.json")