Skip to content

Commit

Permalink
Update gget_elm.py
Browse files Browse the repository at this point in the history
Change the wording "UniProt ID" to "UniProt Acc" everywhere in the gget code and documentation.
  • Loading branch information
choang20 authored Jan 5, 2024
1 parent 937fbb8 commit 31cc8b7
Showing 1 changed file with 22 additions and 22 deletions.
44 changes: 22 additions & 22 deletions gget/gget_elm.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@ def motif_in_query(row):
)


def get_elm_instances(UniProtID):
def get_elm_instances(UniProtAcc):
"""
Get ELM instances and their information from local ELM tsv files.
Args:
- UniProtID UniProt ID to search for in the accession column of ELM tsv files.
- UniProtAcc UniProt Acc to search for in the accession column of ELM tsv files.
Returns: dataframe combining ELM instances and information (description, functional site...)
"""
Expand All @@ -50,7 +50,7 @@ def get_elm_instances(UniProtID):
# Rename columns
df_instances_matching = df_instances_matching.rename(
columns={
"Primary_Acc": "Ortholog_UniProt_ID",
"Primary_Acc": "Ortholog_UniProt_Acc",
"Start": "motif_start_in_subject",
"End": "motif_end_in_subject",
}
Expand All @@ -76,7 +76,7 @@ def seq_workflow(
diamond_binary,
):
"""
Alignment of sequence using DIAMOND to get UniProt ID. Use the UniProt ID to construct an ortholog dataframe similar to the UniProt workflow
Alignment of sequence using DIAMOND to get UniProt Acc. Use the UniProt Acc to construct an ortholog dataframe similar to the UniProt workflow
except for additional columns for start, end and whether the motif overlaps the subject sequence.
Args:
Expand Down Expand Up @@ -115,20 +115,20 @@ def seq_workflow(
)

else:
# Construct df with elm instances from UniProt ID returned from diamond
# TODO double check that this gets info if more than one UniProt ID matched
# Construct df with elm instances from UniProt Acc returned from diamond
# TODO double check that this gets info if more than one UniProt Acc matched
if verbose:
uniprot_ids = [
uniprot_accs = [
str(id).split("|")[1]
for id in df_diamond["subject_accession"].values
]
logging.info(
f"ORTHO Sequence {seq_number}/{len(sequences)}: DIAMOND found the following orthologous proteins: {', '.join(uniprot_ids)}. Retrieving ELMs for each UniProt ID..."
f"ORTHO Sequence {seq_number}/{len(sequences)}: DIAMOND found the following orthologous proteins: {', '.join(uniprot_accs)}. Retrieving ELMs for each UniProt Accession Number..."
)

for i, uniprot_id in enumerate(df_diamond["subject_accession"].values):
# print(f"UniProt ID {uniprot_id}")
df_elm = get_elm_instances(str(uniprot_id).split("|")[1])
for i, uniprot_acc in enumerate(df_diamond["subject_accession"].values):
# print(f"UniProt Acc {uniprot_acc}")
df_elm = get_elm_instances(str(uniprot_acc).split("|")[1])
# missing motifs other than the first one
# df_elm["query_cover"] = df_diamond["length"].values[i] / seq_len * 100
df_elm["query_seq_length"] = df_diamond["query_seq_length"].values[i]
Expand Down Expand Up @@ -157,7 +157,7 @@ def regex_match(sequence):
Compare ELM regex with input sequence and return all matching elms
Args:
sequence - user input sequence (can be either amino acid seq or UniProt ID)
sequence - user input sequence (can be either amino acid seq or UniProt Acc)
Returns:
df_final - dataframe containing regex matches
Expand Down Expand Up @@ -271,7 +271,7 @@ def elm(
# If sequence is not a valid amino sequence, raise error
if not set(sequence) <= amino_acids:
logging.warning(
f"Input amino acid sequence contains invalid characters. If the input is a UniProt ID, please use flag --uniprot (Python: uniprot=True)."
f"Input amino acid sequence contains invalid characters. If the input is a UniProt Acc, please use flag --uniprot (Python: uniprot=True)."
)

# Build ortholog dataframe
Expand All @@ -283,13 +283,13 @@ def elm(

if len(ortho_df) == 0:
logging.warning(
"ORTHO UniProt ID does not match UniProt IDs in the ELM database. Fetching amino acid sequence from UniProt..."
"ORTHO UniProt Accession Number does not match UniProt Accession Numbers in the ELM database. Fetching amino acid sequence from UniProt..."
)
df_uniprot = get_uniprot_seqs(server=UNIPROT_REST_API, ensembl_ids=sequence)

if len(df_uniprot) > 0:
# Only grab sequences where IDs match exactly
aa_seqs = df_uniprot[df_uniprot["uniprot_id"] == sequence][
# Only grab sequences where Accs match exactly
aa_seqs = df_uniprot[df_uniprot["uniprot_acc"] == sequence][
"sequence"
].values

Expand Down Expand Up @@ -328,7 +328,7 @@ def elm(

# Reorder columns of ortholog data frame
ortho_cols = [
"Ortholog_UniProt_ID",
"Ortholog_UniProt_Acc",
"ProteinName",
"class_accession",
"ELMIdentifier",
Expand Down Expand Up @@ -373,27 +373,27 @@ def elm(
logging.info(f"REGEX Finding regex motif matches...")
fetch_aa_failed = False
if uniprot:
# use amino acid sequence associated with UniProt ID to do regex match
# use amino acid sequence associated with UniProt Acc to do regex match

# do not fetch sequence again if already done above
if not "df_uniprot" in locals():
df_uniprot = get_uniprot_seqs(UNIPROT_REST_API, sequence)

if len(df_uniprot) > 0:
# Only grab sequences where IDs match exactly
sequences = df_uniprot[df_uniprot["uniprot_id"] == sequence][
sequences = df_uniprot[df_uniprot["uniprot_acc"] == sequence][
"sequence"
].values

if len(sequences) == 0:
logging.warning(
"REGEX No amino acid sequences found for UniProt ID {sequence} from the UniProt server."
"REGEX No amino acid sequences found for UniProt Acc {sequence} from the UniProt server."
)
fetch_aa_failed = True
else:
if len(sequences) > 1:
logging.warning(
f"REGEX More than one amino acid sequence found for UniProt ID {sequence}. Using best match to find regex motifs."
f"REGEX More than one amino acid sequence found for UniProt Acc {sequence}. Using best match to find regex motifs."
)
sequence = sequences[0]

Expand All @@ -403,7 +403,7 @@ def elm(

if len(df_regex_matches) == 0:
logging.warning(
"REGEX No regex matches found for input sequence or UniProt ID."
"REGEX No regex matches found for input sequence or UniProt Acc."
)

# Reorder regex columns
Expand Down

0 comments on commit 31cc8b7

Please sign in to comment.