-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
47478f6
commit 9079572
Showing
1 changed file
with
197 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,197 @@ | ||
#!/usr/bin/env python3 | ||
|
||
HELP = """ | ||
stride_consensus: calculate a consensus STRIDE secondary structure profile | ||
-------------------------------------------------------------------------- | ||
Luca Jovine, Karolinska Institutet ([email protected]) | ||
Version 1.3.2 - 231025 | ||
This python script takes as single argument an output file generated by the secondary structure assignment program | ||
STRIDE (PMID 15215436 ; https://webclu.bio.wzw.tum.de/stride), compares the secondary structure assignments for | ||
multiple copies of the same protein and generates a combined sequence and average secondary structure profile. | ||
If the starting set of coordinates contains information for different proteins, the corresponding ATOM lines should | ||
be separated into multiple files before running STRIDE and then analyzing its output with stride_consensus. | ||
The secondary structure codes (SSCs) output by the program match those of STRIDE: | ||
H = Alpha helix | ||
G = 3-10 helix | ||
I = PI-helix | ||
E = Extended conformation | ||
B (or b) = Isolated bridge | ||
T = Turn | ||
C = Coil | ||
In addition, the consensus secondary structure profile generated by the program will contain S or l (lowercase L) SSCs | ||
for residues whose assignments are tied between H, G, I, E or B, T, C, respectively. | ||
""" | ||
|
||
import sys | ||
from collections import defaultdict | ||
|
||
def generate_consensus_structure(chain_secondary_structure): | ||
from collections import Counter | ||
consensus_structure = [] | ||
|
||
for i in range(len(next(iter(chain_secondary_structure.values())))): | ||
ssc_at_position = [structure[i] for structure in chain_secondary_structure.values() if structure[i] != '-'] | ||
ssc_count = Counter(ssc_at_position) | ||
|
||
if len(ssc_count) == 0: | ||
consensus_structure.append('-') | ||
else: | ||
max_count = max(ssc_count.values()) | ||
most_common_ssc = [ssc for ssc, count in ssc_count.items() if count == max_count] | ||
|
||
if len(most_common_ssc) == 1: | ||
consensus_structure.append(most_common_ssc[0]) | ||
else: | ||
has_upper = any(ssc in ['H', 'G', 'I', 'E'] for ssc in most_common_ssc) | ||
has_lower = any(ssc in ['b', 't', 'c'] for ssc in most_common_ssc) | ||
|
||
if has_upper and not has_lower: | ||
if len(set(most_common_ssc).intersection(set(['H', 'G', 'I', 'E']))) > 1: | ||
consensus_structure.append('S') | ||
else: | ||
consensus_structure.append(most_common_ssc[0]) | ||
elif has_lower and not has_upper: | ||
if len(set(most_common_ssc).intersection(set(['b', 't', 'c']))) > 1: | ||
consensus_structure.append('l') | ||
else: | ||
consensus_structure.append(most_common_ssc[0]) | ||
elif has_upper and has_lower: | ||
upper_ssc = [ssc for ssc in most_common_ssc if ssc in ['H', 'G', 'I', 'E']] | ||
upper_count = Counter(upper_ssc) | ||
max_upper_count = max(upper_count.values()) | ||
most_common_upper_ssc = [ssc for ssc, count in upper_count.items() if count == max_upper_count] | ||
|
||
if len(most_common_upper_ssc) == 1: | ||
consensus_structure.append(most_common_upper_ssc[0]) | ||
else: | ||
consensus_structure.append('S') | ||
else: | ||
consensus_structure.append(most_common_ssc[0]) | ||
|
||
return ''.join(consensus_structure) | ||
|
||
def generate_consensus(chain_sequences): | ||
consensus_sequence = [] | ||
for i in range(len(next(iter(chain_sequences.values())))): | ||
aa_set = set() | ||
for sequence in chain_sequences.values(): | ||
aa = sequence[i] | ||
if aa != '-': | ||
aa_set.add(aa) | ||
if len(aa_set) == 0: | ||
consensus_sequence.append('-') | ||
elif len(aa_set) == 1: | ||
consensus_sequence.append(next(iter(aa_set))) | ||
else: | ||
consensus_sequence.append('X') # Placeholder for ambiguous residues | ||
|
||
return ''.join(consensus_sequence) | ||
|
||
def main(): | ||
if len(sys.argv) != 2 or (len(sys.argv) == 2 and sys.argv[1] in {'-h', '-help', '--help'}): | ||
if len(sys.argv) == 2 and sys.argv[1] in {'-h', '-help', '--help'}: | ||
print(HELP) | ||
sys.exit(1) | ||
print("\n Syntax: stride_consensus.py [-h] <stride_output_file>\n") | ||
sys.exit(1) | ||
|
||
input_file_path = sys.argv[1] | ||
|
||
try: | ||
with open(input_file_path, 'r') as f: | ||
pass | ||
except FileNotFoundError: | ||
print(f'\n ERROR: file "{input_file_path}" not found\n') | ||
sys.exit(1) | ||
|
||
output_file_path = input_file_path.split('.')[0] + "_consensus.txt" | ||
|
||
asg_lines = [] | ||
with open(input_file_path, 'r') as f: | ||
for line in f: | ||
if line.startswith("ASG"): | ||
asg_lines.append(line.strip()) | ||
|
||
if not asg_lines: | ||
print(f'\n ERROR: No STRIDE ASG lines found in input file "{input_file_path}"\n') | ||
sys.exit(1) | ||
|
||
last_aa = max(int(line.split()[3]) for line in asg_lines) | ||
|
||
chain_sequences = defaultdict(str) | ||
chain_secondary_structure_1char = defaultdict(str) | ||
|
||
for line in asg_lines: | ||
fields = line.split() | ||
aa_type = fields[1] | ||
sec_str_code_1char = fields[5] | ||
chain_id = fields[2] | ||
res_num = int(fields[3]) | ||
|
||
aa_type_1_letter = { | ||
'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', | ||
'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', | ||
'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', | ||
'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V' | ||
}.get(aa_type, 'X') | ||
|
||
while len(chain_sequences[chain_id]) < res_num - 1: | ||
chain_sequences[chain_id] += '-' | ||
chain_secondary_structure_1char[chain_id] += '-' | ||
chain_sequences[chain_id] += aa_type_1_letter | ||
if sec_str_code_1char in ['B', 'T', 'C']: | ||
chain_secondary_structure_1char[chain_id] += sec_str_code_1char.lower() | ||
else: | ||
chain_secondary_structure_1char[chain_id] += sec_str_code_1char | ||
|
||
# Remove gap-only columns at the beginning of the sequences | ||
min_gap_index = min(next((i for i, c in enumerate(seq) if c != "-"), len(seq)) for seq in chain_sequences.values()) | ||
chain_sequences = {k: v[min_gap_index:] for k, v in chain_sequences.items()} | ||
chain_secondary_structure_1char = {k: v[min_gap_index:] for k, v in chain_secondary_structure_1char.items()} | ||
last_aa -= min_gap_index | ||
|
||
for chain_id in chain_sequences.keys(): | ||
chain_sequences[chain_id] = chain_sequences[chain_id].ljust(last_aa, '-') | ||
chain_secondary_structure_1char[chain_id] = chain_secondary_structure_1char[chain_id].ljust(last_aa, '-') | ||
|
||
residue_bar_lines = [ | ||
' ' * 15 + ''.join(str(i // 100) for i in range(1, last_aa + 1)), | ||
' ' * 15 + ''.join(str((i % 100) // 10) for i in range(1, last_aa + 1)), | ||
' ' * 15 + ''.join(str(i % 10) for i in range(1, last_aa + 1)) | ||
] | ||
|
||
# Adjust the residue numbering bars to match the trimmed sequences | ||
residue_bar_lines = [ | ||
" " * 15 + "".join(str((i + min_gap_index + 1) // 100) for i in range(last_aa)), | ||
" " * 15 + "".join(str(((i + min_gap_index + 1) % 100) // 10) for i in range(last_aa)), | ||
" " * 15 + "".join(str((i + min_gap_index + 1) % 10) for i in range(last_aa)) | ||
] | ||
|
||
output_content = "" | ||
for bar_line in residue_bar_lines: | ||
output_content += f"{bar_line}\n" | ||
for chain_id, sequence in chain_sequences.items(): | ||
output_content += f"\nChain {chain_id} seq: {sequence}\nChain {chain_id} str: {chain_secondary_structure_1char[chain_id]}\n" | ||
|
||
consensus_sequence = generate_consensus(chain_sequences) | ||
|
||
consensus_structure = generate_consensus_structure(chain_secondary_structure_1char) | ||
|
||
output_content += f"\nCombined seq: {consensus_sequence}\n" | ||
|
||
consensus_structure = generate_consensus_structure(chain_secondary_structure_1char) | ||
consensus_structure_minimal = ''.join('_' if c.islower() else c for c in consensus_structure) | ||
|
||
output_content += f"Consensus str: {consensus_structure}\nConsensus min: {consensus_structure_minimal}\n" | ||
|
||
with open(output_file_path, 'w') as f: | ||
f.write(output_content) | ||
|
||
print(output_content.replace('\\n', '\n')) | ||
|
||
if __name__ == "__main__": | ||
main() |