From 9079572bb8ed1bbc1c836ba39ded15841ebffd63 Mon Sep 17 00:00:00 2001
From: Luca Jovine <lucajovine@users.noreply.github.com>
Date: Wed, 25 Oct 2023 15:56:05 +0200
Subject: [PATCH] Add files via upload

---
 stride_consensus.py | 197 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 stride_consensus.py

diff --git a/stride_consensus.py b/stride_consensus.py
new file mode 100644
index 0000000..fe069ef
--- /dev/null
+++ b/stride_consensus.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+
+HELP = """
+ stride_consensus: calculate a consensus STRIDE secondary structure profile
+ --------------------------------------------------------------------------
+ Luca Jovine, Karolinska Institutet (luca.jovine@ki.se)
+ Version 1.3.2 - 231025
+ 
+ This python script takes as single argument an output file generated by the secondary structure assignment program
+ STRIDE (PMID 15215436 ; https://webclu.bio.wzw.tum.de/stride), compares the secondary structure assignments for
+ multiple copies of the same protein and generates a combined sequence and average secondary structure profile.
+ If the starting set of coordinates contains information for different proteins, the corresponding ATOM lines should
+ be separated into multiple files before running STRIDE and then analyzing its output with stride_consensus.
+ 
+ The secondary structure codes (SSCs) output by the program match those of STRIDE:
+ 
+ H = Alpha helix
+ G = 3-10 helix
+ I = PI-helix
+ E = Extended conformation
+ B (or b) = Isolated bridge
+ T = Turn
+ C = Coil
+ 
+ In addition, the consensus secondary structure profile generated by the program will contain S or l (lowercase L) SSCs
+ for residues whose assignments are tied between H, G, I, E or B, T, C, respectively.
+"""
+
+import sys
+from collections import defaultdict
+
+def generate_consensus_structure(chain_secondary_structure):
+    from collections import Counter
+    consensus_structure = []
+
+    for i in range(len(next(iter(chain_secondary_structure.values())))):
+        ssc_at_position = [structure[i] for structure in chain_secondary_structure.values() if structure[i] != '-']
+        ssc_count = Counter(ssc_at_position)
+
+        if len(ssc_count) == 0:
+            consensus_structure.append('-')
+        else:
+            max_count = max(ssc_count.values())
+            most_common_ssc = [ssc for ssc, count in ssc_count.items() if count == max_count]
+
+            if len(most_common_ssc) == 1:
+                consensus_structure.append(most_common_ssc[0])
+            else:
+                has_upper = any(ssc in ['H', 'G', 'I', 'E'] for ssc in most_common_ssc)
+                has_lower = any(ssc in ['b', 't', 'c'] for ssc in most_common_ssc)
+                
+                if has_upper and not has_lower:
+                    if len(set(most_common_ssc).intersection(set(['H', 'G', 'I', 'E']))) > 1:
+                        consensus_structure.append('S')
+                    else:
+                        consensus_structure.append(most_common_ssc[0])
+                elif has_lower and not has_upper:
+                    if len(set(most_common_ssc).intersection(set(['b', 't', 'c']))) > 1:
+                        consensus_structure.append('l')
+                    else:
+                        consensus_structure.append(most_common_ssc[0])
+                elif has_upper and has_lower:
+                    upper_ssc = [ssc for ssc in most_common_ssc if ssc in ['H', 'G', 'I', 'E']]
+                    upper_count = Counter(upper_ssc)
+                    max_upper_count = max(upper_count.values())
+                    most_common_upper_ssc = [ssc for ssc, count in upper_count.items() if count == max_upper_count]
+
+                    if len(most_common_upper_ssc) == 1:
+                        consensus_structure.append(most_common_upper_ssc[0])
+                    else:
+                        consensus_structure.append('S')
+                else:
+                    consensus_structure.append(most_common_ssc[0])
+
+    return ''.join(consensus_structure)
+
+def generate_consensus(chain_sequences):
+    consensus_sequence = []
+    for i in range(len(next(iter(chain_sequences.values())))):
+        aa_set = set()
+        for sequence in chain_sequences.values():
+            aa = sequence[i]
+            if aa != '-':
+                aa_set.add(aa)
+        if len(aa_set) == 0:
+            consensus_sequence.append('-')
+        elif len(aa_set) == 1:
+            consensus_sequence.append(next(iter(aa_set)))
+        else:
+            consensus_sequence.append('X')  # Placeholder for ambiguous residues
+
+    return ''.join(consensus_sequence)
+
+def main():
+    if len(sys.argv) != 2 or (len(sys.argv) == 2 and sys.argv[1] in {'-h', '-help', '--help'}):
+        if len(sys.argv) == 2 and sys.argv[1] in {'-h', '-help', '--help'}:
+            print(HELP)
+            sys.exit(1)
+        print("\n Syntax: stride_consensus.py [-h] <stride_output_file>\n")
+        sys.exit(1)
+
+    input_file_path = sys.argv[1]
+
+    try:
+        with open(input_file_path, 'r') as f:
+            pass
+    except FileNotFoundError:
+        print(f'\n ERROR: file "{input_file_path}" not found\n')
+        sys.exit(1)
+
+    output_file_path = input_file_path.split('.')[0] + "_consensus.txt"
+
+    asg_lines = []
+    with open(input_file_path, 'r') as f:
+        for line in f:
+            if line.startswith("ASG"):
+                asg_lines.append(line.strip())
+                
+    if not asg_lines:
+        print(f'\n ERROR: No STRIDE ASG lines found in input file "{input_file_path}"\n')
+        sys.exit(1)
+
+    last_aa = max(int(line.split()[3]) for line in asg_lines)
+    
+    chain_sequences = defaultdict(str)
+    chain_secondary_structure_1char = defaultdict(str)
+
+    for line in asg_lines:
+        fields = line.split()
+        aa_type = fields[1]
+        sec_str_code_1char = fields[5]
+        chain_id = fields[2]
+        res_num = int(fields[3])
+        
+        aa_type_1_letter = {
+            'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
+            'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
+            'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
+            'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
+        }.get(aa_type, 'X')
+
+        while len(chain_sequences[chain_id]) < res_num - 1:
+            chain_sequences[chain_id] += '-'
+            chain_secondary_structure_1char[chain_id] += '-'
+        chain_sequences[chain_id] += aa_type_1_letter
+        if sec_str_code_1char in ['B', 'T', 'C']:
+            chain_secondary_structure_1char[chain_id] += sec_str_code_1char.lower()
+        else:
+            chain_secondary_structure_1char[chain_id] += sec_str_code_1char
+
+    # Remove gap-only columns at the beginning of the sequences
+    min_gap_index = min(next((i for i, c in enumerate(seq) if c != "-"), len(seq)) for seq in chain_sequences.values())
+    chain_sequences = {k: v[min_gap_index:] for k, v in chain_sequences.items()}
+    chain_secondary_structure_1char = {k: v[min_gap_index:] for k, v in chain_secondary_structure_1char.items()}
+    last_aa -= min_gap_index
+
+    for chain_id in chain_sequences.keys():
+        chain_sequences[chain_id] = chain_sequences[chain_id].ljust(last_aa, '-')
+        chain_secondary_structure_1char[chain_id] = chain_secondary_structure_1char[chain_id].ljust(last_aa, '-')
+
+    residue_bar_lines = [
+        ' ' * 15 + ''.join(str(i // 100) for i in range(1, last_aa + 1)),
+        ' ' * 15 + ''.join(str((i % 100) // 10) for i in range(1, last_aa + 1)),
+        ' ' * 15 + ''.join(str(i % 10) for i in range(1, last_aa + 1))
+    ]
+
+    # Adjust the residue numbering bars to match the trimmed sequences
+    residue_bar_lines = [
+        " " * 15 + "".join(str((i + min_gap_index + 1) // 100) for i in range(last_aa)),
+        " " * 15 + "".join(str(((i + min_gap_index + 1) % 100) // 10) for i in range(last_aa)),
+        " " * 15 + "".join(str((i + min_gap_index + 1) % 10) for i in range(last_aa))
+    ]
+
+    output_content = ""
+    for bar_line in residue_bar_lines:
+        output_content += f"{bar_line}\n"
+    for chain_id, sequence in chain_sequences.items():
+        output_content += f"\nChain {chain_id} seq:   {sequence}\nChain {chain_id} str:   {chain_secondary_structure_1char[chain_id]}\n"
+
+    consensus_sequence = generate_consensus(chain_sequences)
+       
+    consensus_structure = generate_consensus_structure(chain_secondary_structure_1char)
+        
+    output_content += f"\nCombined seq:  {consensus_sequence}\n"
+    
+    consensus_structure = generate_consensus_structure(chain_secondary_structure_1char)
+    consensus_structure_minimal = ''.join('_' if c.islower() else c for c in consensus_structure)
+    
+    output_content += f"Consensus str: {consensus_structure}\nConsensus min: {consensus_structure_minimal}\n"
+
+    with open(output_file_path, 'w') as f:
+        f.write(output_content)
+
+    print(output_content.replace('\\n', '\n'))
+
+if __name__ == "__main__":
+    main()