Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
lucajovine authored Oct 25, 2023
1 parent 47478f6 commit 9079572
Showing 1 changed file with 197 additions and 0 deletions.
197 changes: 197 additions & 0 deletions stride_consensus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
#!/usr/bin/env python3

HELP = """
stride_consensus: calculate a consensus STRIDE secondary structure profile
--------------------------------------------------------------------------
Luca Jovine, Karolinska Institutet ([email protected])
Version 1.3.2 - 231025
This python script takes as single argument an output file generated by the secondary structure assignment program
STRIDE (PMID 15215436 ; https://webclu.bio.wzw.tum.de/stride), compares the secondary structure assignments for
multiple copies of the same protein and generates a combined sequence and average secondary structure profile.
If the starting set of coordinates contains information for different proteins, the corresponding ATOM lines should
be separated into multiple files before running STRIDE and then analyzing its output with stride_consensus.
The secondary structure codes (SSCs) output by the program match those of STRIDE:
H = Alpha helix
G = 3-10 helix
I = PI-helix
E = Extended conformation
B (or b) = Isolated bridge
T = Turn
C = Coil
In addition, the consensus secondary structure profile generated by the program will contain S or l (lowercase L) SSCs
for residues whose assignments are tied between H, G, I, E or B, T, C, respectively.
"""

import sys
from collections import defaultdict

def generate_consensus_structure(chain_secondary_structure):
from collections import Counter
consensus_structure = []

for i in range(len(next(iter(chain_secondary_structure.values())))):
ssc_at_position = [structure[i] for structure in chain_secondary_structure.values() if structure[i] != '-']
ssc_count = Counter(ssc_at_position)

if len(ssc_count) == 0:
consensus_structure.append('-')
else:
max_count = max(ssc_count.values())
most_common_ssc = [ssc for ssc, count in ssc_count.items() if count == max_count]

if len(most_common_ssc) == 1:
consensus_structure.append(most_common_ssc[0])
else:
has_upper = any(ssc in ['H', 'G', 'I', 'E'] for ssc in most_common_ssc)
has_lower = any(ssc in ['b', 't', 'c'] for ssc in most_common_ssc)

if has_upper and not has_lower:
if len(set(most_common_ssc).intersection(set(['H', 'G', 'I', 'E']))) > 1:
consensus_structure.append('S')
else:
consensus_structure.append(most_common_ssc[0])
elif has_lower and not has_upper:
if len(set(most_common_ssc).intersection(set(['b', 't', 'c']))) > 1:
consensus_structure.append('l')
else:
consensus_structure.append(most_common_ssc[0])
elif has_upper and has_lower:
upper_ssc = [ssc for ssc in most_common_ssc if ssc in ['H', 'G', 'I', 'E']]
upper_count = Counter(upper_ssc)
max_upper_count = max(upper_count.values())
most_common_upper_ssc = [ssc for ssc, count in upper_count.items() if count == max_upper_count]

if len(most_common_upper_ssc) == 1:
consensus_structure.append(most_common_upper_ssc[0])
else:
consensus_structure.append('S')
else:
consensus_structure.append(most_common_ssc[0])

return ''.join(consensus_structure)

def generate_consensus(chain_sequences):
consensus_sequence = []
for i in range(len(next(iter(chain_sequences.values())))):
aa_set = set()
for sequence in chain_sequences.values():
aa = sequence[i]
if aa != '-':
aa_set.add(aa)
if len(aa_set) == 0:
consensus_sequence.append('-')
elif len(aa_set) == 1:
consensus_sequence.append(next(iter(aa_set)))
else:
consensus_sequence.append('X') # Placeholder for ambiguous residues

return ''.join(consensus_sequence)

def main():
if len(sys.argv) != 2 or (len(sys.argv) == 2 and sys.argv[1] in {'-h', '-help', '--help'}):
if len(sys.argv) == 2 and sys.argv[1] in {'-h', '-help', '--help'}:
print(HELP)
sys.exit(1)
print("\n Syntax: stride_consensus.py [-h] <stride_output_file>\n")
sys.exit(1)

input_file_path = sys.argv[1]

try:
with open(input_file_path, 'r') as f:
pass
except FileNotFoundError:
print(f'\n ERROR: file "{input_file_path}" not found\n')
sys.exit(1)

output_file_path = input_file_path.split('.')[0] + "_consensus.txt"

asg_lines = []
with open(input_file_path, 'r') as f:
for line in f:
if line.startswith("ASG"):
asg_lines.append(line.strip())

if not asg_lines:
print(f'\n ERROR: No STRIDE ASG lines found in input file "{input_file_path}"\n')
sys.exit(1)

last_aa = max(int(line.split()[3]) for line in asg_lines)

chain_sequences = defaultdict(str)
chain_secondary_structure_1char = defaultdict(str)

for line in asg_lines:
fields = line.split()
aa_type = fields[1]
sec_str_code_1char = fields[5]
chain_id = fields[2]
res_num = int(fields[3])

aa_type_1_letter = {
'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
}.get(aa_type, 'X')

while len(chain_sequences[chain_id]) < res_num - 1:
chain_sequences[chain_id] += '-'
chain_secondary_structure_1char[chain_id] += '-'
chain_sequences[chain_id] += aa_type_1_letter
if sec_str_code_1char in ['B', 'T', 'C']:
chain_secondary_structure_1char[chain_id] += sec_str_code_1char.lower()
else:
chain_secondary_structure_1char[chain_id] += sec_str_code_1char

# Remove gap-only columns at the beginning of the sequences
min_gap_index = min(next((i for i, c in enumerate(seq) if c != "-"), len(seq)) for seq in chain_sequences.values())
chain_sequences = {k: v[min_gap_index:] for k, v in chain_sequences.items()}
chain_secondary_structure_1char = {k: v[min_gap_index:] for k, v in chain_secondary_structure_1char.items()}
last_aa -= min_gap_index

for chain_id in chain_sequences.keys():
chain_sequences[chain_id] = chain_sequences[chain_id].ljust(last_aa, '-')
chain_secondary_structure_1char[chain_id] = chain_secondary_structure_1char[chain_id].ljust(last_aa, '-')

residue_bar_lines = [
' ' * 15 + ''.join(str(i // 100) for i in range(1, last_aa + 1)),
' ' * 15 + ''.join(str((i % 100) // 10) for i in range(1, last_aa + 1)),
' ' * 15 + ''.join(str(i % 10) for i in range(1, last_aa + 1))
]

# Adjust the residue numbering bars to match the trimmed sequences
residue_bar_lines = [
" " * 15 + "".join(str((i + min_gap_index + 1) // 100) for i in range(last_aa)),
" " * 15 + "".join(str(((i + min_gap_index + 1) % 100) // 10) for i in range(last_aa)),
" " * 15 + "".join(str((i + min_gap_index + 1) % 10) for i in range(last_aa))
]

output_content = ""
for bar_line in residue_bar_lines:
output_content += f"{bar_line}\n"
for chain_id, sequence in chain_sequences.items():
output_content += f"\nChain {chain_id} seq: {sequence}\nChain {chain_id} str: {chain_secondary_structure_1char[chain_id]}\n"

consensus_sequence = generate_consensus(chain_sequences)

consensus_structure = generate_consensus_structure(chain_secondary_structure_1char)

output_content += f"\nCombined seq: {consensus_sequence}\n"

consensus_structure = generate_consensus_structure(chain_secondary_structure_1char)
consensus_structure_minimal = ''.join('_' if c.islower() else c for c in consensus_structure)

output_content += f"Consensus str: {consensus_structure}\nConsensus min: {consensus_structure_minimal}\n"

with open(output_file_path, 'w') as f:
f.write(output_content)

print(output_content.replace('\\n', '\n'))

if __name__ == "__main__":
main()

0 comments on commit 9079572

Please sign in to comment.