Skip to content

Add Epitope data from TDC #96

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions data/iedb_jespersen_et_al/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
---
name: iedb_jespersen_et_al
description: |-
Epitope prediction is to predict the active region in the antigen.
This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell
epitopes and non-epitope amino acids determined from crystal structures.
targets:
- id: active_position
description: amino acids sequence position that is active in binding
units:
type: categorical
names:
- noun: epitope
- noun: amino acids sequence active in the antigen binding
- noun: epitope sequence active in the antigen binding
- noun: epitope sequence active in the binding
uris:
- http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189
identifiers:
- id: Antigen_sequence
type: Other
names:
- amino acid sequence
- AA sequence
- epitope amino acid sequence
- epitope AA sequence
description: amino acid sequence
license: CC BY 4.0
links:
- url: https://doi.org/10.1093/nar/gky1006
description: corresponding publication
- url: https://doi.org/10.1093/nar/gkx346
description: corresponding publication
- url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al
description: data source
num_points: 3159
bibtex:
- |-
@article{Vita2018,
doi = {10.1093/nar/gky1006},
url = {https://doi.org/10.1093/nar/gky1006},
year = {2018},
month = oct,
publisher = {Oxford University Press (OUP)},
volume = {47},
number = {D1},
pages = {D339--D343},
author = {Randi Vita and Swapnil Mahajan and James A Overton and
Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and
Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
title = {The Immune Epitope Database (IEDB): 2018 update},
journal = {Nucleic Acids Research}
}
- |-
@article{Jespersen2017,
doi = {10.1093/nar/gkx346},
url = {https://doi.org/10.1093/nar/gkx346},
year = {2017},
month = may,
publisher = {Oxford University Press (OUP)},
volume = {45},
number = {W1},
pages = {W24--W29},
author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using
conformational epitopes},
journal = {Nucleic Acids Research}
}
191 changes: 191 additions & 0 deletions data/iedb_jespersen_et_al/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
import pandas as pd
import yaml
from tdc.single_pred import Epitope


def get_and_transform_data():
    """Fetch the IEDB (Jespersen et al.) epitope data from TDC and write the dataset files.

    The function runs the whole pipeline and writes four files into the
    current working directory:

    * ``data_raw.csv``      -- train/valid/test splits concatenated, with a ``split`` column
    * ``data_original.csv`` -- raw data with renamed columns plus the derived
      ``active_position`` column
    * ``data_clean.csv``    -- only ``Antigen_sequence`` and ``active_position``
    * ``meta.yaml``         -- dataset metadata (description, targets, links, BibTeX)

    Returns:
        None. All results are written to disk.

    Raises:
        AssertionError: if the column layout of the downloaded data differs from
            the expected one, or if the cleaned table contains duplicated rows.
    """
    # get raw data
    target_subfolder = "IEDB_Jespersen"
    splits = Epitope(name=target_subfolder).get_split()
    # Tag every row with the split it came from; ``assign`` returns a copy, so
    # the frames handed out by TDC are not mutated.
    df = pd.concat(
        [splits[name].assign(split=name) for name in ("train", "valid", "test")],
        axis=0,
    )

    fn_data_raw = "data_raw.csv"
    df.to_csv(fn_data_raw, index=False)
    del df

    def get_active_position(seq, active_position, sequence_only=False):
        """Extract the residues of ``seq`` located at the given indices.

        Args:
            seq: the sequence string.
            active_position: list of 0-based indices, or the string form of
                such a list (e.g. ``"[1, 2, 3]"``) as it appears after a
                round trip through a csv file.
            sequence_only: if True, return only the selected residues
                concatenated; otherwise return a string as long as ``seq``
                in which every non-selected residue is replaced by ``_``.

        Example:
            MASQKRPS, [1, 2, 3, 4, 6] -> _ASQK_P_   (sequence_only=False)
            MASQKRPS, [1, 2, 3, 4, 6] -> ASQKP      (sequence_only=True)
        """
        if isinstance(active_position, str):
            # The list was cast to a string when loaded from the raw csv file.
            # Skip empty chunks so that "[]" yields an empty list instead of
            # failing on int(""); int() itself tolerates surrounding blanks.
            chunks = active_position.strip().strip("[]").split(",")
            active_position = [int(x) for x in chunks if x.strip()]

        if sequence_only:
            return "".join(seq[x] for x in active_position)

        masked = ["_"] * len(seq)
        for x in active_position:
            masked[x] = seq[x]
        return "".join(masked)

    # process raw data
    df = pd.read_csv(fn_data_raw, sep=",")
    fields_orig = df.columns.tolist()
    assert fields_orig == ["Antigen_ID", "Antigen", "Y", "split"]

    # Rename columns of raw data
    fields_clean = [
        "Antigen_ID",
        "Antigen_sequence",
        "active_positions_indices",
        "split",
    ]
    df.columns = fields_clean

    # get active position
    antigen_seq = df.Antigen_sequence.tolist()
    a_pos_ind_list = df.active_positions_indices.tolist()
    df["active_position"] = [
        get_active_position(x, o, sequence_only=True)
        for x, o in zip(antigen_seq, a_pos_ind_list)
    ]

    # save data to original
    fn_data_original = "data_original.csv"
    df.to_csv(fn_data_original, index=False)
    df = pd.read_csv(fn_data_original, sep=",")
    fields_orig = df.columns.tolist()
    assert fields_orig == [
        "Antigen_ID",
        "Antigen_sequence",
        "active_positions_indices",
        "split",
        "active_position",
    ]

    # get right columns
    fields_clean = ["Antigen_sequence", "active_position"]
    df = df[fields_clean]
    assert fields_orig != fields_clean
    assert not df.duplicated().sum()
    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    meta = {
        "name": "iedb_jespersen_et_al",
        "description": """Epitope prediction is to predict the active region in the antigen.
This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell
epitopes and non-epitope amino acids determined from crystal structures.""",
        "targets": [
            {
                "id": "active_position",  # name of the column in a tabular dataset
                "description": "amino acids sequence position that is active in binding",
                "units": None,  # units of the values in this column (leave empty if unitless)
                "type": "categorical",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    {"noun": "epitope"},
                    {"noun": "amino acids sequence active in the antigen binding"},
                    {"noun": "epitope sequence active in the antigen binding"},
                    {"noun": "epitope sequence active in the binding"},
                ],
                "uris": [
                    "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189",
                ],
            }
        ],
        "identifiers": [
            {
                "id": "Antigen_sequence",  # column name
                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                "names": [
                    "amino acid sequence",
                    "AA sequence",
                    "epitope amino acid sequence",
                    "epitope AA sequence",
                ],
                "description": "amino acid sequence",  # description of the identifier column
            }
        ],
        "license": "CC BY 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            {
                "url": "https://doi.org/10.1093/nar/gky1006",
                "description": "corresponding publication",
            },
            {
                "url": "https://doi.org/10.1093/nar/gkx346",
                "description": "corresponding publication",
            },
            {
                "url": "https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al",
                "description": "data source",
            },
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        # Each BibTeX entry must have balanced braces: exactly one "}" per field
        # value and a final "}" that closes the entry itself.
        "bibtex": [
            """@article{Vita2018,
doi = {10.1093/nar/gky1006},
url = {https://doi.org/10.1093/nar/gky1006},
year = {2018},
month = oct,
publisher = {Oxford University Press (OUP)},
volume = {47},
number = {D1},
pages = {D339--D343},
author = {Randi Vita and Swapnil Mahajan and James A Overton and
Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and
Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
title = {The Immune Epitope Database (IEDB): 2018 update},
journal = {Nucleic Acids Research}
}""",
            """@article{Jespersen2017,
doi = {10.1093/nar/gkx346},
url = {https://doi.org/10.1093/nar/gkx346},
year = {2017},
month = may,
publisher = {Oxford University Press (OUP)},
volume = {45},
number = {W1},
pages = {W24--W29},
author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using
conformational epitopes},
journal = {Nucleic Acids Research}
}""",
        ],
    }

    def str_presenter(dumper, data):
        """Configure yaml to dump multiline strings in literal block style (``|``).

        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dump
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


# Run the download/transform pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    get_and_transform_data()
64 changes: 64 additions & 0 deletions data/pdb_jespersen_et_al/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
---
name: pdb_jespersen_et_al
description: |-
Epitope prediction is to predict the active region in the antigen.
This dataset is from Bepipred, which curates a dataset from PDB.
It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.
targets:
- id: active_position
description: amino acids sequence position that is active in binding
units:
type: categorical
names:
- noun: epitope
- noun: amino acids sequence active in antigen binding
- noun: epitope sequence active in antigen binding
- noun: epitope sequence active in binding
uris:
- http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189
identifiers:
- id: Antigen_sequence
type: Other
names:
- amino acid sequence
- AA sequence
- epitope amino acid sequence
- epitope AA sequence
description: amino acid sequence
license: CC BY 4.0
links:
- url: https://doi.org/10.1093/nar/gkx346
description: corresponding publication
- url: https://doi.org/10.1093/nar/28.1.235
description: corresponding publication
- url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al
description: data source
num_points: 447
bibtex:
- |-
@article{Jespersen2017,
doi = {10.1093/nar/gkx346},
url = {https://doi.org/10.1093/nar/gkx346},
year = {2017},
month = may,
publisher = {Oxford University Press (OUP)},
volume = {45},
number = {W1},
pages = {W24--W29},
author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using
conformational epitopes},
journal = {Nucleic Acids Research}
}
- |-
@article{Berman2000,
doi = {10.1093/nar/28.1.235},
url = {https://doi.org/10.1093/nar/28.1.235},
year = {2000},
month = jan,
publisher = {Oxford University Press (OUP)},
volume = {28},
number = {1},
pages = {235--242},
author = {H. M. Berman},
title = {The Protein Data Bank},
journal = {Nucleic Acids Research}
}
Loading