From 9079572bb8ed1bbc1c836ba39ded15841ebffd63 Mon Sep 17 00:00:00 2001 From: Luca Jovine Date: Wed, 25 Oct 2023 15:56:05 +0200 Subject: [PATCH] Add files via upload --- stride_consensus.py | 197 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 stride_consensus.py diff --git a/stride_consensus.py b/stride_consensus.py new file mode 100644 index 0000000..fe069ef --- /dev/null +++ b/stride_consensus.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 + +HELP = """ + stride_consensus: calculate a consensus STRIDE secondary structure profile + -------------------------------------------------------------------------- + Luca Jovine, Karolinska Institutet (luca.jovine@ki.se) + Version 1.3.2 - 231025 + + This python script takes as single argument an output file generated by the secondary structure assignment program + STRIDE (PMID 15215436 ; https://webclu.bio.wzw.tum.de/stride), compares the secondary structure assignments for + multiple copies of the same protein and generates a combined sequence and average secondary structure profile. + If the starting set of coordinates contains information for different proteins, the corresponding ATOM lines should + be separated into multiple files before running STRIDE and then analyzing its output with stride_consensus. + + The secondary structure codes (SSCs) output by the program match those of STRIDE: + + H = Alpha helix + G = 3-10 helix + I = PI-helix + E = Extended conformation + B (or b) = Isolated bridge + T = Turn + C = Coil + + In addition, the consensus secondary structure profile generated by the program will contain S or l (lowercase L) SSCs + for residues whose assignments are tied between H, G, I, E or B, T, C, respectively. +""" + +import sys +from collections import defaultdict + +def generate_consensus_structure(chain_secondary_structure): + from collections import Counter + consensus_structure = [] + + for i in range(len(next(iter(chain_secondary_structure.values())))): + ssc_at_position = [structure[i] for structure in chain_secondary_structure.values() if structure[i] != '-'] + ssc_count = Counter(ssc_at_position) + + if len(ssc_count) == 0: + consensus_structure.append('-') + else: + max_count = max(ssc_count.values()) + most_common_ssc = [ssc for ssc, count in ssc_count.items() if count == max_count] + + if len(most_common_ssc) == 1: + consensus_structure.append(most_common_ssc[0]) + else: + has_upper = any(ssc in ['H', 'G', 'I', 'E'] for ssc in most_common_ssc) + has_lower = any(ssc in ['b', 't', 'c'] for ssc in most_common_ssc) + + if has_upper and not has_lower: + if len(set(most_common_ssc).intersection(set(['H', 'G', 'I', 'E']))) > 1: + consensus_structure.append('S') + else: + consensus_structure.append(most_common_ssc[0]) + elif has_lower and not has_upper: + if len(set(most_common_ssc).intersection(set(['b', 't', 'c']))) > 1: + consensus_structure.append('l') + else: + consensus_structure.append(most_common_ssc[0]) + elif has_upper and has_lower: + upper_ssc = [ssc for ssc in most_common_ssc if ssc in ['H', 'G', 'I', 'E']] + upper_count = Counter(upper_ssc) + max_upper_count = max(upper_count.values()) + most_common_upper_ssc = [ssc for ssc, count in upper_count.items() if count == max_upper_count] + + if len(most_common_upper_ssc) == 1: + consensus_structure.append(most_common_upper_ssc[0]) + else: + consensus_structure.append('S') + else: + consensus_structure.append(most_common_ssc[0]) + + return ''.join(consensus_structure) + +def generate_consensus(chain_sequences): + consensus_sequence = [] + for i in range(len(next(iter(chain_sequences.values())))): + aa_set = set() + for sequence in chain_sequences.values(): + aa = sequence[i] + if aa != '-': + aa_set.add(aa) + if len(aa_set) == 0: + consensus_sequence.append('-') + elif len(aa_set) == 1: + consensus_sequence.append(next(iter(aa_set))) + else: + consensus_sequence.append('X') # Placeholder for ambiguous residues + + return ''.join(consensus_sequence) + +def main(): + if len(sys.argv) != 2 or (len(sys.argv) == 2 and sys.argv[1] in {'-h', '-help', '--help'}): + if len(sys.argv) == 2 and sys.argv[1] in {'-h', '-help', '--help'}: + print(HELP) + sys.exit(1) + print("\n Syntax: stride_consensus.py [-h] \n") + sys.exit(1) + + input_file_path = sys.argv[1] + + try: + with open(input_file_path, 'r') as f: + pass + except FileNotFoundError: + print(f'\n ERROR: file "{input_file_path}" not found\n') + sys.exit(1) + + output_file_path = input_file_path.split('.')[0] + "_consensus.txt" + + asg_lines = [] + with open(input_file_path, 'r') as f: + for line in f: + if line.startswith("ASG"): + asg_lines.append(line.strip()) + + if not asg_lines: + print(f'\n ERROR: No STRIDE ASG lines found in input file "{input_file_path}"\n') + sys.exit(1) + + last_aa = max(int(line.split()[3]) for line in asg_lines) + + chain_sequences = defaultdict(str) + chain_secondary_structure_1char = defaultdict(str) + + for line in asg_lines: + fields = line.split() + aa_type = fields[1] + sec_str_code_1char = fields[5] + chain_id = fields[2] + res_num = int(fields[3]) + + aa_type_1_letter = { + 'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', + 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', + 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', + 'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V' + }.get(aa_type, 'X') + + while len(chain_sequences[chain_id]) < res_num - 1: + chain_sequences[chain_id] += '-' + chain_secondary_structure_1char[chain_id] += '-' + chain_sequences[chain_id] += aa_type_1_letter + if sec_str_code_1char in ['B', 'T', 'C']: + chain_secondary_structure_1char[chain_id] += sec_str_code_1char.lower() + else: + chain_secondary_structure_1char[chain_id] += sec_str_code_1char + + # Remove gap-only columns at the beginning of the sequences + min_gap_index = min(next((i for i, c in enumerate(seq) if c != "-"), len(seq)) for seq in chain_sequences.values()) + chain_sequences = {k: v[min_gap_index:] for k, v in chain_sequences.items()} + chain_secondary_structure_1char = {k: v[min_gap_index:] for k, v in chain_secondary_structure_1char.items()} + last_aa -= min_gap_index + + for chain_id in chain_sequences.keys(): + chain_sequences[chain_id] = chain_sequences[chain_id].ljust(last_aa, '-') + chain_secondary_structure_1char[chain_id] = chain_secondary_structure_1char[chain_id].ljust(last_aa, '-') + + residue_bar_lines = [ + ' ' * 15 + ''.join(str(i // 100) for i in range(1, last_aa + 1)), + ' ' * 15 + ''.join(str((i % 100) // 10) for i in range(1, last_aa + 1)), + ' ' * 15 + ''.join(str(i % 10) for i in range(1, last_aa + 1)) + ] + + # Adjust the residue numbering bars to match the trimmed sequences + residue_bar_lines = [ + " " * 15 + "".join(str((i + min_gap_index + 1) // 100) for i in range(last_aa)), + " " * 15 + "".join(str(((i + min_gap_index + 1) % 100) // 10) for i in range(last_aa)), + " " * 15 + "".join(str((i + min_gap_index + 1) % 10) for i in range(last_aa)) + ] + + output_content = "" + for bar_line in residue_bar_lines: + output_content += f"{bar_line}\n" + for chain_id, sequence in chain_sequences.items(): + output_content += f"\nChain {chain_id} seq: {sequence}\nChain {chain_id} str: {chain_secondary_structure_1char[chain_id]}\n" + + consensus_sequence = generate_consensus(chain_sequences) + + consensus_structure = generate_consensus_structure(chain_secondary_structure_1char) + + output_content += f"\nCombined seq: {consensus_sequence}\n" + + consensus_structure = generate_consensus_structure(chain_secondary_structure_1char) + consensus_structure_minimal = ''.join('_' if c.islower() else c for c in consensus_structure) + + output_content += f"Consensus str: {consensus_structure}\nConsensus min: {consensus_structure_minimal}\n" + + with open(output_file_path, 'w') as f: + f.write(output_content) + + print(output_content.replace('\\n', '\n')) + +if __name__ == "__main__": + main()