Update gget_elm.py

Change the wording "UniProt ID" to "UniProt Acc" (short for "UniProt accession") everywhere in the gget code and documentation.
pachterlab · Jan 5, 2024 · 31cc8b7 · 31cc8b7
1 parent 937fbb8
commit 31cc8b7
Showing 1 changed file with 22 additions and 22 deletions.
diff --git a/gget/gget_elm.py b/gget/gget_elm.py
@@ -32,12 +32,12 @@ def motif_in_query(row):
     )
 
 
-def get_elm_instances(UniProtID):
+def get_elm_instances(UniProtAcc):
     """
     Get ELM instances and their information from local ELM tsv files.
 
     Args:
-    - UniProtID   UniProt ID to search for in the accession column of ELM tsv files.
+    - UniProtAcc   UniProt Acc to search for in the accession column of ELM tsv files.
 
     Returns: dataframe combining ELM instances and information (description, functional site...)
     """
@@ -50,7 +50,7 @@ def get_elm_instances(UniProtID):
     # Rename columns
     df_instances_matching = df_instances_matching.rename(
         columns={
-            "Primary_Acc": "Ortholog_UniProt_ID",
+            "Primary_Acc": "Ortholog_UniProt_Acc",
             "Start": "motif_start_in_subject",
             "End": "motif_end_in_subject",
         }
@@ -76,7 +76,7 @@ def seq_workflow(
     diamond_binary,
 ):
     """
-    Alignment of sequence using DIAMOND to get UniProt ID. Use the UniProt ID to construct an ortholog dataframe similar to the UniProt workflow
+    Alignment of sequence using DIAMOND to get UniProt Acc. Use the UniProt Acc to construct an ortholog dataframe similar to the UniProt workflow
     except for additional columns for start, end and whether the motif overlaps the subject sequence.
 
     Args:
@@ -115,20 +115,20 @@ def seq_workflow(
             )
 
         else:
-            # Construct df with elm instances from UniProt ID returned from diamond
-            # TODO double check that this gets info if more than one UniProt ID matched
+            # Construct df with elm instances from UniProt Acc returned from diamond
+            # TODO double check that this gets info if more than one UniProt Acc matched
             if verbose:
-                uniprot_ids = [
+                uniprot_accs = [
                     str(id).split("|")[1]
                     for id in df_diamond["subject_accession"].values
                 ]
                 logging.info(
-                    f"ORTHO Sequence {seq_number}/{len(sequences)}: DIAMOND found the following orthologous proteins: {', '.join(uniprot_ids)}. Retrieving ELMs for each UniProt ID..."
+                    f"ORTHO Sequence {seq_number}/{len(sequences)}: DIAMOND found the following orthologous proteins: {', '.join(uniprot_accs)}. Retrieving ELMs for each UniProt Accession Numbers..."
                 )
 
-            for i, uniprot_id in enumerate(df_diamond["subject_accession"].values):
-                # print(f"UniProt ID {uniprot_id}")
-                df_elm = get_elm_instances(str(uniprot_id).split("|")[1])
+            for i, uniprot_acc in enumerate(df_diamond["subject_accession"].values):
+                # print(f"UniProt Acc {uniprot_acc}")
+                df_elm = get_elm_instances(str(uniprot_acc).split("|")[1])
                 # missing motifs other than the first one
                 # df_elm["query_cover"] = df_diamond["length"].values[i] / seq_len * 100
                 df_elm["query_seq_length"] = df_diamond["query_seq_length"].values[i]
@@ -157,7 +157,7 @@ def regex_match(sequence):
     Compare ELM regex with input sequence and return all matching elms
 
     Args:
-    sequence - user input sequence (can be either amino acid seq or UniProt ID)
+    sequence - user input sequence (can be either amino acid seq or UniProt Acc)
 
     Returns:
     df_final - dataframe containing regex matches
@@ -271,7 +271,7 @@ def elm(
         # If sequence is not a valid amino sequence, raise error
         if not set(sequence) <= amino_acids:
             logging.warning(
-                f"Input amino acid sequence contains invalid characters. If the input is a UniProt ID, please use flag --uniprot (Python: uniprot=True)."
+                f"Input amino acid sequence contains invalid characters. If the input is a UniProt Acc, please use flag --uniprot (Python: uniprot=True)."
             )
 
     # Build ortholog dataframe
@@ -283,13 +283,13 @@ def elm(
 
         if len(ortho_df) == 0:
             logging.warning(
-                "ORTHO UniProt ID does not match UniProt IDs in the ELM database. Fetching amino acid sequence from UniProt..."
+                "ORTHO UniProt Accession Number does not match UniProt Accession Numberss in the ELM database. Fetching amino acid sequence from UniProt..."
             )
             df_uniprot = get_uniprot_seqs(server=UNIPROT_REST_API, ensembl_ids=sequence)
 
             if len(df_uniprot) > 0:
-                # Only grab sequences where IDs match exactly
-                aa_seqs = df_uniprot[df_uniprot["uniprot_id"] == sequence][
+                # Only grab sequences where Accs match exactly
+                aa_seqs = df_uniprot[df_uniprot["uniprot_acc"] == sequence][
                     "sequence"
                 ].values
 
@@ -328,7 +328,7 @@ def elm(
 
     # Reorder columns of ortholog data frame
     ortho_cols = [
-        "Ortholog_UniProt_ID",
+        "Ortholog_UniProt_Acc",
         "ProteinName",
         "class_accession",
         "ELMIdentifier",
@@ -373,27 +373,27 @@ def elm(
         logging.info(f"REGEX Finding regex motif matches...")
     fetch_aa_failed = False
     if uniprot:
-        # use amino acid sequence associated with UniProt ID to do regex match
+        # use amino acid sequence associated with UniProt Acc to do regex match
 
         # do not fetch sequence again if already done above
         if not "df_uniprot" in locals():
             df_uniprot = get_uniprot_seqs(UNIPROT_REST_API, sequence)
 
         if len(df_uniprot) > 0:
             # Only grab sequences where IDs match exactly
-            sequences = df_uniprot[df_uniprot["uniprot_id"] == sequence][
+            sequences = df_uniprot[df_uniprot["uniprot_acc"] == sequence][
                 "sequence"
             ].values
 
             if len(sequences) == 0:
                 logging.warning(
-                    "REGEX No amino acid sequences found for UniProt ID {sequence} from the UniProt server."
+                    "REGEX No amino acid sequences found for UniProt Acc {sequence} from the UniProt server."
                 )
                 fetch_aa_failed = True
             else:
                 if len(sequences) > 1:
                     logging.warning(
-                        f"REGEX More than one amino acid sequence found for UniProt ID {sequence}. Using best match to find regex motifs."
+                        f"REGEX More than one amino acid sequence found for UniProt Acc {sequence}. Using best match to find regex motifs."
                     )
                 sequence = sequences[0]
 
@@ -403,7 +403,7 @@ def elm(
 
     if len(df_regex_matches) == 0:
         logging.warning(
-            "REGEX No regex matches found for input sequence or UniProt ID."
+            "REGEX No regex matches found for input sequence or UniProt Acc."
         )
 
     # Reorder regex columns