diff --git a/gget/gget_elm.py b/gget/gget_elm.py index 4bba7735..b43a601d 100644 --- a/gget/gget_elm.py +++ b/gget/gget_elm.py @@ -32,12 +32,12 @@ def motif_in_query(row): ) -def get_elm_instances(UniProtID): +def get_elm_instances(UniProtAcc): """ Get ELM instances and their information from local ELM tsv files. Args: - - UniProtID UniProt ID to search for in the accession column of ELM tsv files. + - UniProtAcc UniProt Acc to search for in the accession column of ELM tsv files. Returns: dataframe combining ELM instances and information (description, functional site...) """ @@ -50,7 +50,7 @@ def get_elm_instances(UniProtID): # Rename columns df_instances_matching = df_instances_matching.rename( columns={ - "Primary_Acc": "Ortholog_UniProt_ID", + "Primary_Acc": "Ortholog_UniProt_Acc", "Start": "motif_start_in_subject", "End": "motif_end_in_subject", } @@ -76,7 +76,7 @@ def seq_workflow( diamond_binary, ): """ - Alignment of sequence using DIAMOND to get UniProt ID. Use the UniProt ID to construct an ortholog dataframe similar to the UniProt workflow + Alignment of sequence using DIAMOND to get UniProt Acc. Use the UniProt Acc to construct an ortholog dataframe similar to the UniProt workflow except for additional columns for start, end and whether the motif overlaps the subject sequence. Args: @@ -115,20 +115,20 @@ def seq_workflow( ) else: - # Construct df with elm instances from UniProt ID returned from diamond - # TODO double check that this gets info if more than one UniProt ID matched + # Construct df with elm instances from UniProt Acc returned from diamond + # TODO double check that this gets info if more than one UniProt Acc matched if verbose: - uniprot_ids = [ + uniprot_accs = [ str(id).split("|")[1] for id in df_diamond["subject_accession"].values ] logging.info( - f"ORTHO Sequence {seq_number}/{len(sequences)}: DIAMOND found the following orthologous proteins: {', '.join(uniprot_ids)}. Retrieving ELMs for each UniProt ID..." 
+ f"ORTHO Sequence {seq_number}/{len(sequences)}: DIAMOND found the following orthologous proteins: {', '.join(uniprot_accs)}. Retrieving ELMs for each UniProt Accession Number..." ) - for i, uniprot_id in enumerate(df_diamond["subject_accession"].values): - # print(f"UniProt ID {uniprot_id}") - df_elm = get_elm_instances(str(uniprot_id).split("|")[1]) + for i, uniprot_acc in enumerate(df_diamond["subject_accession"].values): + # print(f"UniProt Acc {uniprot_acc}") + df_elm = get_elm_instances(str(uniprot_acc).split("|")[1]) # missing motifs other than the first one # df_elm["query_cover"] = df_diamond["length"].values[i] / seq_len * 100 df_elm["query_seq_length"] = df_diamond["query_seq_length"].values[i] @@ -157,7 +157,7 @@ def regex_match(sequence): Compare ELM regex with input sequence and return all matching elms Args: - sequence - user input sequence (can be either amino acid seq or UniProt ID) + sequence - user input sequence (can be either amino acid seq or UniProt Acc) Returns: df_final - dataframe containing regex matches @@ -271,7 +271,7 @@ def elm( # If sequence is not a valid amino sequence, raise error if not set(sequence) <= amino_acids: logging.warning( - f"Input amino acid sequence contains invalid characters. If the input is a UniProt ID, please use flag --uniprot (Python: uniprot=True)." + f"Input amino acid sequence contains invalid characters. If the input is a UniProt Acc, please use flag --uniprot (Python: uniprot=True)." ) # Build ortholog dataframe @@ -283,13 +283,13 @@ def elm( if len(ortho_df) == 0: logging.warning( - "ORTHO UniProt ID does not match UniProt IDs in the ELM database. Fetching amino acid sequence from UniProt..." + "ORTHO UniProt Accession Number does not match UniProt Accession Numbers in the ELM database. Fetching amino acid sequence from UniProt..."
) df_uniprot = get_uniprot_seqs(server=UNIPROT_REST_API, ensembl_ids=sequence) if len(df_uniprot) > 0: - # Only grab sequences where IDs match exactly - aa_seqs = df_uniprot[df_uniprot["uniprot_id"] == sequence][ + # Only grab sequences where Accs match exactly + aa_seqs = df_uniprot[df_uniprot["uniprot_acc"] == sequence][ "sequence" ].values @@ -328,7 +328,7 @@ def elm( # Reorder columns of ortholog data frame ortho_cols = [ - "Ortholog_UniProt_ID", + "Ortholog_UniProt_Acc", "ProteinName", "class_accession", "ELMIdentifier", @@ -373,7 +373,7 @@ def elm( logging.info(f"REGEX Finding regex motif matches...") fetch_aa_failed = False if uniprot: - # use amino acid sequence associated with UniProt ID to do regex match + # use amino acid sequence associated with UniProt Acc to do regex match # do not fetch sequence again if already done above if not "df_uniprot" in locals(): @@ -381,19 +381,19 @@ def elm( if len(df_uniprot) > 0: # Only grab sequences where IDs match exactly - sequences = df_uniprot[df_uniprot["uniprot_id"] == sequence][ + sequences = df_uniprot[df_uniprot["uniprot_acc"] == sequence][ "sequence" ].values if len(sequences) == 0: logging.warning( - "REGEX No amino acid sequences found for UniProt ID {sequence} from the UniProt server." + "REGEX No amino acid sequences found for UniProt Acc {sequence} from the UniProt server." ) fetch_aa_failed = True else: if len(sequences) > 1: logging.warning( - f"REGEX More than one amino acid sequence found for UniProt ID {sequence}. Using best match to find regex motifs." + f"REGEX More than one amino acid sequence found for UniProt Acc {sequence}. Using best match to find regex motifs." ) sequence = sequences[0] @@ -403,7 +403,7 @@ def elm( if len(df_regex_matches) == 0: logging.warning( - "REGEX No regex matches found for input sequence or UniProt ID." + "REGEX No regex matches found for input sequence or UniProt Acc." ) # Reorder regex columns