From 6c146970e229e97c5266c2dc0c073387c54953f9 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 11 Mar 2023 12:34:34 +0200 Subject: [PATCH 01/13] Add Epitope data from TDC I add Epitope data from tdc here : https://tdcommons.ai/single_pred_tasks/epitope/ I will need help in validation my approach, need to ensure the indices start with 0 or 1 in epitope active binding one --- data/IEDB_Jespersen_et_al/meta.yaml | 54 +++++++++ data/IEDB_Jespersen_et_al/transform.py | 151 +++++++++++++++++++++++++ data/PDB_Jespersen_et_al/meta.yaml | 54 +++++++++ data/PDB_Jespersen_et_al/transform.py | 151 +++++++++++++++++++++++++ 4 files changed, 410 insertions(+) create mode 100644 data/IEDB_Jespersen_et_al/meta.yaml create mode 100644 data/IEDB_Jespersen_et_al/transform.py create mode 100644 data/PDB_Jespersen_et_al/meta.yaml create mode 100644 data/PDB_Jespersen_et_al/transform.py diff --git a/data/IEDB_Jespersen_et_al/meta.yaml b/data/IEDB_Jespersen_et_al/meta.yaml new file mode 100644 index 000000000..9d39d1c3f --- /dev/null +++ b/data/IEDB_Jespersen_et_al/meta.yaml @@ -0,0 +1,54 @@ +name: IEDB_Jespersen_et_al +description: Epitope prediction is to predict the active region in the antigen. This + dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell + epitopes and non-epitope amino acids determined from crystal structures. +targets: +- id: active_position + description: amino acids sequence position that is active in binding + units: '' + type: Other + names: + - amino acids sequence active in binding + - Epitope + uris: + - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189 +identifiers: +- id: Antigen_sequence + type: Other + description: amino acid sequence +license: CC BY 4.0 +links: +- url: https://doi.org/10.1093/nar/gky1006 + description: corresponding publication +- url: https://doi.org/10.1093/nar/gkx346 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al + description: data source +num_points: 3159 +bibtex: +- |- + @article{Vita2018, + doi = {10.1093/nar/gky1006}, + url = {https://doi.org/10.1093/nar/gky1006}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D339--D343}, + author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters}, + title = {The Immune Epitope Database ({IEDB}): 2018 update}, + journal = {Nucleic Acids Research}} +- |- + @article{Jespersen2017, + doi = {10.1093/nar/gkx346}, + url = {https://doi.org/10.1093/nar/gkx346}, + year = {2017}, + month = may, + publisher = {Oxford University Press ({OUP})}, + volume = {45}, + number = {W1}, + pages = {W24--W29}, + author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, + title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, + journal = {Nucleic Acids Research}} diff --git a/data/IEDB_Jespersen_et_al/transform.py b/data/IEDB_Jespersen_et_al/transform.py new file mode 100644 index 000000000..0308c54f5 --- /dev/null +++ b/data/IEDB_Jespersen_et_al/transform.py @@ -0,0 +1,151 @@ +import pandas as pd +import yaml +from tdc.single_pred import Epitope + +def get_and_transform_data(): + # get raw data + target_folder = 'IEDB_Jespersen_et_al' + target_subfolder = 'IEDB_Jespersen' + data = Epitope(name = target_subfolder) + def get_active_position(seq, active_poisition, sequence_only=False): + ''' + Input: given a sequence and list of active index + Output: return active sequence and other sequence convert to _ + MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_ + ''' + if sequence_only: + _seq = ''.join([seq[x] for x in active_poisition]) + return _seq + _seq = ['_' for a in range(len(seq))] + for x in active_poisition: + _seq[x] = seq[x] + _seq = ''.join(_seq) + return _seq + + df = pd.read_pickle('data/iedb_jespersen.pkl') + fields_orig = df.columns.tolist() + assert fields_orig == ['ID', 'X', 'Y'] + + + #Rename columns of raw data + fields_clean = ['Antigen_ID', 'Antigen_sequence', 'active_positions_indices'] + df.columns = fields_clean + + #get active position + antigen_seq = df.Antigen_sequence.tolist() + a_pos_ind_list = df.active_positions_indices.tolist() + df['active_position'] = [get_active_position(x,o) for x,o in zip(antigen_seq, a_pos_ind_list)] + + # save data to original + fn_data_original = 'data_original.csv' + df.to_csv(fn_data_original,index=None) + df = pd.read_csv(fn_data_original, sep=',') + fields_orig = df.columns.tolist() + assert fields_orig == ['Antigen_ID', + 'Antigen_sequence', + 'active_positions_indices', + 'active_position'] + + # get right columns + + df = df[['Antigen_sequence', 'active_position']] + fields_clean = ['Antigen_sequence', 'active_position'] + df.columns = fields_clean + assert fields_orig != fields_clean + assert not df.duplicated().sum() + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + + meta = { + "name": f"{target_folder}", # unique identifier, we will also use this for directory names + "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""", + "targets": [ + { + "id": "active_position", # name of the column in a tabular dataset + "description": "amino acids sequence position that is active in binding", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "Other", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "amino acids sequence active in binding", + "Epitope" + ], + "uris":[ + "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189", + ], + } + ], + "identifiers": [ + { + "id": "Antigen_sequence", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "amino acid sequence", # description (optional, except for "Other") + } + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.1093/nar/gky1006", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gkx346", + "description": "corresponding publication", + }, + { + "url": "https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al", + "description": "data source", + } + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{Vita2018, + doi = {10.1093/nar/gky1006}, + url = {https://doi.org/10.1093/nar/gky1006}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D339--D343}, + author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters}, + title = {The Immune Epitope Database ({IEDB}): 2018 update}, + journal = {Nucleic Acids Research}}""", + + """@article{Jespersen2017, + doi = {10.1093/nar/gkx346}, + url = {https://doi.org/10.1093/nar/gkx346}, + year = {2017}, + month = may, + publisher = {Oxford University Press ({OUP})}, + volume = {45}, + number = {W1}, + pages = {W24--W29}, + author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, + title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, + journal = {Nucleic Acids Research}}""", + + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/PDB_Jespersen_et_al/meta.yaml b/data/PDB_Jespersen_et_al/meta.yaml new file mode 100644 index 000000000..d771ee0e4 --- /dev/null +++ b/data/PDB_Jespersen_et_al/meta.yaml @@ -0,0 +1,54 @@ +name: PDB_Jespersen_et_al +description: Epitope prediction is to predict the active region in the antigen. This + dataset is from Bepipred, which curates a dataset from PDB. It collects B-cell epitopes + and non-epitope amino acids determined from crystal structures. +targets: +- id: active_position + description: '' + units: amino acids sequence position that is active in binding + type: Other + names: + - amino acids sequence active in binding + - Epitope + uris: + - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189 +identifiers: +- id: Antigen_sequence + type: Other + description: amino acid sequence +license: CC BY 4.0 +links: +- url: https://doi.org/10.1093/nar/gkx346 + description: corresponding publication +- url: https://doi.org/10.1093/nar/28.1.235 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al + description: data source +num_points: 447 +bibtex: +- |- + @article{Jespersen2017, + doi = {10.1093/nar/gkx346}, + url = {https://doi.org/10.1093/nar/gkx346}, + year = {2017}, + month = may, + publisher = {Oxford University Press ({OUP})}, + volume = {45}, + number = {W1}, + pages = {W24--W29}, + author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, + title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, + journal = {Nucleic Acids Research}} +- |- + @article{Berman2000, + doi = {10.1093/nar/28.1.235}, + url = {https://doi.org/10.1093/nar/28.1.235}, + year = {2000}, + month = jan, + publisher = {Oxford University Press ({OUP})}, + volume = {28}, + number = {1}, + pages = {235--242}, + author = {H. M. Berman}, + title = {The Protein Data Bank}, + journal = {Nucleic Acids Research}} diff --git a/data/PDB_Jespersen_et_al/transform.py b/data/PDB_Jespersen_et_al/transform.py new file mode 100644 index 000000000..afb592343 --- /dev/null +++ b/data/PDB_Jespersen_et_al/transform.py @@ -0,0 +1,151 @@ +import pandas as pd +import yaml +from tdc.single_pred import Epitope + +def get_and_transform_data(): + # get raw data + target_folder = 'PDB_Jespersen_et_al' + target_subfolder = 'PDB_Jespersen' + data = Epitope(name = target_subfolder) + + def get_active_position(seq, active_poisition, sequence_only=False): + ''' + Input: given a sequence and list of active index + Output: return active sequence and other sequence convert to _ + MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_ + ''' + if sequence_only: + _seq = ''.join([seq[x] for x in active_poisition]) + return _seq + _seq = ['_' for a in range(len(seq))] + for x in active_poisition: + _seq[x] = seq[x] + _seq = ''.join(_seq) + return _seq + + df = pd.read_pickle('data/pdb_jespersen.pkl') + fields_orig = df.columns.tolist() + assert fields_orig == ['ID', 'X', 'Y'] + + #Rename columns of raw data + fields_clean = ['Antigen_ID', 'Antigen_sequence', 'active_positions_indices'] + df.columns = fields_clean + + #get active position + antigen_seq = df.Antigen_sequence.tolist() + a_pos_ind_list = df.active_positions_indices.tolist() + df['active_position'] = [get_active_position(x,o) for x,o in zip(antigen_seq, a_pos_ind_list)] + + # save data to original + fn_data_original = 'data_original.csv' + df.to_csv(fn_data_original,index=None) + df = pd.read_csv(fn_data_original, sep=',') + fields_orig = df.columns.tolist() + assert fields_orig == ['Antigen_ID', + 'Antigen_sequence', + 'active_positions_indices', + 'active_position'] + + # get right columns + + df = df[['Antigen_sequence', 'active_position']] + fields_clean = ['Antigen_sequence', 'active_position'] + df.columns = fields_clean + assert fields_orig != fields_clean + assert not df.duplicated().sum() + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + meta = { + "name": f"{target_folder}", # unique identifier, we will also use this for directory names + "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from PDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""", + "targets": [ + { + "id": "active_position", # name of the column in a tabular dataset + "description": "", # description of what this column means + "units": "amino acids sequence position that is active in binding", # units of the values in this column (leave empty if unitless) + "type": "Other", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "amino acids sequence active in binding", + "Epitope" + ], + "uris":[ + "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189", + ], + } + ], + + "identifiers": [ + { + "id": "Antigen_sequence", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "amino acid sequence", # description (optional, except for "Other") + } + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.1093/nar/gkx346", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/28.1.235", + "description": "corresponding publication", + }, + { + "url": "https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al", + "description": "data source", + } + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{Jespersen2017, + doi = {10.1093/nar/gkx346}, + url = {https://doi.org/10.1093/nar/gkx346}, + year = {2017}, + month = may, + publisher = {Oxford University Press ({OUP})}, + volume = {45}, + number = {W1}, + pages = {W24--W29}, + author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, + title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, + journal = {Nucleic Acids Research}}""", + + """@article{Berman2000, + doi = {10.1093/nar/28.1.235}, + url = {https://doi.org/10.1093/nar/28.1.235}, + year = {2000}, + month = jan, + publisher = {Oxford University Press ({OUP})}, + volume = {28}, + number = {1}, + pages = {235--242}, + author = {H. M. Berman}, + title = {The Protein Data Bank}, + journal = {Nucleic Acids Research}}""", + + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + +if __name__ == "__main__": + get_and_transform_data() From 33933b23012fd36dd2738df17190fd53c6fbbb84 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 11 Mar 2023 10:35:10 +0000 Subject: [PATCH 02/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/IEDB_Jespersen_et_al/meta.yaml | 94 +++++++++++++------------- data/IEDB_Jespersen_et_al/transform.py | 85 ++++++++++++----------- data/PDB_Jespersen_et_al/meta.yaml | 94 +++++++++++++------------- data/PDB_Jespersen_et_al/transform.py | 91 +++++++++++++------------ 4 files changed, 185 insertions(+), 179 deletions(-) diff --git a/data/IEDB_Jespersen_et_al/meta.yaml b/data/IEDB_Jespersen_et_al/meta.yaml index 9d39d1c3f..70430fb05 100644 --- a/data/IEDB_Jespersen_et_al/meta.yaml +++ b/data/IEDB_Jespersen_et_al/meta.yaml @@ -1,54 +1,54 @@ +--- name: IEDB_Jespersen_et_al -description: Epitope prediction is to predict the active region in the antigen. This - dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell - epitopes and non-epitope amino acids determined from crystal structures. +description: Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from IEDB. It collects + B-cell epitopes and non-epitope amino acids determined from crystal structures. targets: -- id: active_position - description: amino acids sequence position that is active in binding - units: '' - type: Other - names: - - amino acids sequence active in binding - - Epitope - uris: - - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189 + - id: active_position + description: amino acids sequence position that is active in binding + units: '' + type: Other + names: + - amino acids sequence active in binding + - Epitope + uris: + - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189 identifiers: -- id: Antigen_sequence - type: Other - description: amino acid sequence + - id: Antigen_sequence + type: Other + description: amino acid sequence license: CC BY 4.0 links: -- url: https://doi.org/10.1093/nar/gky1006 - description: corresponding publication -- url: https://doi.org/10.1093/nar/gkx346 - description: corresponding publication -- url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al - description: data source + - url: https://doi.org/10.1093/nar/gky1006 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gkx346 + description: corresponding publication + - url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al + description: data source num_points: 3159 bibtex: -- |- - @article{Vita2018, - doi = {10.1093/nar/gky1006}, - url = {https://doi.org/10.1093/nar/gky1006}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D339--D343}, - author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters}, - title = {The Immune Epitope Database ({IEDB}): 2018 update}, - journal = {Nucleic Acids Research}} -- |- - @article{Jespersen2017, - doi = {10.1093/nar/gkx346}, - url = {https://doi.org/10.1093/nar/gkx346}, - year = {2017}, - month = may, - publisher = {Oxford University Press ({OUP})}, - volume = {45}, - number = {W1}, - pages = {W24--W29}, - author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, - title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, - journal = {Nucleic Acids Research}} + - |- + @article{Vita2018, + doi = {10.1093/nar/gky1006}, + url = {https://doi.org/10.1093/nar/gky1006}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D339--D343}, + author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters}, + title = {The Immune Epitope Database ({IEDB}): 2018 update}, + journal = {Nucleic Acids Research}} + - |- + @article{Jespersen2017, + doi = {10.1093/nar/gkx346}, + url = {https://doi.org/10.1093/nar/gkx346}, + year = {2017}, + month = may, + publisher = {Oxford University Press ({OUP})}, + volume = {45}, + number = {W1}, + pages = {W24--W29}, + author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, + title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, + journal = {Nucleic Acids Research}} diff --git a/data/IEDB_Jespersen_et_al/transform.py b/data/IEDB_Jespersen_et_al/transform.py index 0308c54f5..9dfa986b3 100644 --- a/data/IEDB_Jespersen_et_al/transform.py +++ b/data/IEDB_Jespersen_et_al/transform.py @@ -2,61 +2,65 @@ import yaml from tdc.single_pred import Epitope + def get_and_transform_data(): # get raw data - target_folder = 'IEDB_Jespersen_et_al' - target_subfolder = 'IEDB_Jespersen' - data = Epitope(name = target_subfolder) + target_folder = "IEDB_Jespersen_et_al" + target_subfolder = "IEDB_Jespersen" + data = Epitope(name=target_subfolder) + def get_active_position(seq, active_poisition, sequence_only=False): - ''' - Input: given a sequence and list of active index + """ + Input: given a sequence and list of active index Output: return active sequence and other sequence convert to _ MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_ - ''' - if sequence_only: - _seq = ''.join([seq[x] for x in active_poisition]) + """ + if sequence_only: + _seq = "".join([seq[x] for x in active_poisition]) return _seq - _seq = ['_' for a in range(len(seq))] + _seq = ["_" for a in range(len(seq))] for x in active_poisition: - _seq[x] = seq[x] - _seq = ''.join(_seq) + _seq[x] = seq[x] + _seq = "".join(_seq) return _seq - df = pd.read_pickle('data/iedb_jespersen.pkl') + df = pd.read_pickle("data/iedb_jespersen.pkl") fields_orig = df.columns.tolist() - assert fields_orig == ['ID', 'X', 'Y'] - - - #Rename columns of raw data - fields_clean = ['Antigen_ID', 'Antigen_sequence', 'active_positions_indices'] + assert fields_orig == ["ID", "X", "Y"] + + # Rename columns of raw data + fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"] df.columns = fields_clean - - #get active position + + # get active position antigen_seq = df.Antigen_sequence.tolist() a_pos_ind_list = df.active_positions_indices.tolist() - df['active_position'] = [get_active_position(x,o) for x,o in zip(antigen_seq, a_pos_ind_list)] - + df["active_position"] = [ + get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list) + ] + # save data to original - fn_data_original = 'data_original.csv' - df.to_csv(fn_data_original,index=None) - df = pd.read_csv(fn_data_original, sep=',') + fn_data_original = "data_original.csv" + df.to_csv(fn_data_original, index=None) + df = pd.read_csv(fn_data_original, sep=",") fields_orig = df.columns.tolist() - assert fields_orig == ['Antigen_ID', - 'Antigen_sequence', - 'active_positions_indices', - 'active_position'] - + assert fields_orig == [ + "Antigen_ID", + "Antigen_sequence", + "active_positions_indices", + "active_position", + ] + # get right columns - - df = df[['Antigen_sequence', 'active_position']] - fields_clean = ['Antigen_sequence', 'active_position'] + + df = df[["Antigen_sequence", "active_position"]] + fields_clean = ["Antigen_sequence", "active_position"] df.columns = fields_clean assert fields_orig != fields_clean assert not df.duplicated().sum() # save to csv fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - meta = { "name": f"{target_folder}", # unique identifier, we will also use this for directory names @@ -69,9 +73,9 @@ def get_active_position(seq, active_poisition, sequence_only=False): "type": "Other", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) "amino acids sequence active in binding", - "Epitope" + "Epitope", ], - "uris":[ + "uris": [ "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189", ], } @@ -96,7 +100,7 @@ def get_active_position(seq, active_poisition, sequence_only=False): { "url": "https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al", "description": "data source", - } + }, ], "num_points": len(df), # number of datapoints in this dataset "bibtex": [ @@ -111,8 +115,7 @@ def get_active_position(seq, active_poisition, sequence_only=False): pages = {D339--D343}, author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters}, title = {The Immune Epitope Database ({IEDB}): 2018 update}, - journal = {Nucleic Acids Research}}""", - + journal = {Nucleic Acids Research}}""", """@article{Jespersen2017, doi = {10.1093/nar/gkx346}, url = {https://doi.org/10.1093/nar/gkx346}, @@ -124,11 +127,10 @@ def get_active_position(seq, active_poisition, sequence_only=False): pages = {W24--W29}, author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, - journal = {Nucleic Acids Research}}""", - + journal = {Nucleic Acids Research}}""", ], } - + def str_presenter(dumper, data): """configures yaml for dumping multiline strings Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data @@ -147,5 +149,6 @@ def str_presenter(dumper, data): print(f"Finished processing {meta['name']} dataset!") + if __name__ == "__main__": get_and_transform_data() diff --git a/data/PDB_Jespersen_et_al/meta.yaml b/data/PDB_Jespersen_et_al/meta.yaml index d771ee0e4..4a0a78b54 100644 --- a/data/PDB_Jespersen_et_al/meta.yaml +++ b/data/PDB_Jespersen_et_al/meta.yaml @@ -1,54 +1,54 @@ +--- name: PDB_Jespersen_et_al -description: Epitope prediction is to predict the active region in the antigen. This - dataset is from Bepipred, which curates a dataset from PDB. It collects B-cell epitopes - and non-epitope amino acids determined from crystal structures. +description: Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from PDB. It collects + B-cell epitopes and non-epitope amino acids determined from crystal structures. targets: -- id: active_position - description: '' - units: amino acids sequence position that is active in binding - type: Other - names: - - amino acids sequence active in binding - - Epitope - uris: - - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189 + - id: active_position + description: '' + units: amino acids sequence position that is active in binding + type: Other + names: + - amino acids sequence active in binding + - Epitope + uris: + - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189 identifiers: -- id: Antigen_sequence - type: Other - description: amino acid sequence + - id: Antigen_sequence + type: Other + description: amino acid sequence license: CC BY 4.0 links: -- url: https://doi.org/10.1093/nar/gkx346 - description: corresponding publication -- url: https://doi.org/10.1093/nar/28.1.235 - description: corresponding publication -- url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al - description: data source + - url: https://doi.org/10.1093/nar/gkx346 + description: corresponding publication + - url: https://doi.org/10.1093/nar/28.1.235 + description: corresponding publication + - url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al + description: data source num_points: 447 bibtex: -- |- - @article{Jespersen2017, - doi = {10.1093/nar/gkx346}, - url = {https://doi.org/10.1093/nar/gkx346}, - year = {2017}, - month = may, - publisher = {Oxford University Press ({OUP})}, - volume = {45}, - number = {W1}, - pages = {W24--W29}, - author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, - title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, - journal = {Nucleic Acids Research}} -- |- - @article{Berman2000, - doi = {10.1093/nar/28.1.235}, - url = {https://doi.org/10.1093/nar/28.1.235}, - year = {2000}, - month = jan, - publisher = {Oxford University Press ({OUP})}, - volume = {28}, - number = {1}, - pages = {235--242}, - author = {H. M. Berman}, - title = {The Protein Data Bank}, - journal = {Nucleic Acids Research}} + - |- + @article{Jespersen2017, + doi = {10.1093/nar/gkx346}, + url = {https://doi.org/10.1093/nar/gkx346}, + year = {2017}, + month = may, + publisher = {Oxford University Press ({OUP})}, + volume = {45}, + number = {W1}, + pages = {W24--W29}, + author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, + title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, + journal = {Nucleic Acids Research}} + - |- + @article{Berman2000, + doi = {10.1093/nar/28.1.235}, + url = {https://doi.org/10.1093/nar/28.1.235}, + year = {2000}, + month = jan, + publisher = {Oxford University Press ({OUP})}, + volume = {28}, + number = {1}, + pages = {235--242}, + author = {H. M. Berman}, + title = {The Protein Data Bank}, + journal = {Nucleic Acids Research}} diff --git a/data/PDB_Jespersen_et_al/transform.py b/data/PDB_Jespersen_et_al/transform.py index afb592343..128ec11b5 100644 --- a/data/PDB_Jespersen_et_al/transform.py +++ b/data/PDB_Jespersen_et_al/transform.py @@ -2,61 +2,66 @@ import yaml from tdc.single_pred import Epitope + def get_and_transform_data(): # get raw data - target_folder = 'PDB_Jespersen_et_al' - target_subfolder = 'PDB_Jespersen' - data = Epitope(name = target_subfolder) - + target_folder = "PDB_Jespersen_et_al" + target_subfolder = "PDB_Jespersen" + data = Epitope(name=target_subfolder) + def get_active_position(seq, active_poisition, sequence_only=False): - ''' - Input: given a sequence and list of active index + """ + Input: given a sequence and list of active index Output: return active sequence and other sequence convert to _ MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_ - ''' - if sequence_only: - _seq = ''.join([seq[x] for x in active_poisition]) + """ + if sequence_only: + _seq = "".join([seq[x] for x in active_poisition]) return _seq - _seq = ['_' for a in range(len(seq))] + _seq = ["_" for a in range(len(seq))] for x in active_poisition: - _seq[x] = seq[x] - _seq = ''.join(_seq) + _seq[x] = seq[x] + _seq = "".join(_seq) return _seq - df = pd.read_pickle('data/pdb_jespersen.pkl') + df = pd.read_pickle("data/pdb_jespersen.pkl") fields_orig = df.columns.tolist() - assert fields_orig == ['ID', 'X', 'Y'] - - #Rename columns of raw data - fields_clean = ['Antigen_ID', 'Antigen_sequence', 'active_positions_indices'] + assert fields_orig == ["ID", "X", "Y"] + + # Rename columns of raw data + fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"] df.columns = fields_clean - - #get active position + + # get active position antigen_seq = df.Antigen_sequence.tolist() a_pos_ind_list = df.active_positions_indices.tolist() - df['active_position'] = [get_active_position(x,o) for x,o in zip(antigen_seq, a_pos_ind_list)] - + df["active_position"] = [ + get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list) + ] + # save data to original - fn_data_original = 'data_original.csv' - df.to_csv(fn_data_original,index=None) - df = pd.read_csv(fn_data_original, sep=',') + fn_data_original = "data_original.csv" + df.to_csv(fn_data_original, index=None) + df = pd.read_csv(fn_data_original, sep=",") fields_orig = df.columns.tolist() - assert fields_orig == ['Antigen_ID', - 'Antigen_sequence', - 'active_positions_indices', - 'active_position'] - + assert fields_orig == [ + "Antigen_ID", + "Antigen_sequence", + "active_positions_indices", + "active_position", + ] + # get right columns - - df = df[['Antigen_sequence', 'active_position']] - fields_clean = ['Antigen_sequence', 'active_position'] + + df = df[["Antigen_sequence", "active_position"]] + fields_clean = ["Antigen_sequence", "active_position"] df.columns = fields_clean assert fields_orig != fields_clean assert not df.duplicated().sum() # save to csv fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - + meta = { "name": f"{target_folder}", # unique identifier, we will also use this for directory names "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from PDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""", @@ -68,14 +73,13 @@ def get_active_position(seq, active_poisition, sequence_only=False): "type": "Other", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) "amino acids sequence active in binding", - "Epitope" + "Epitope", ], - "uris":[ + "uris": [ "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189", ], } ], - "identifiers": [ { "id": "Antigen_sequence", # column name @@ -96,11 +100,11 @@ def get_active_position(seq, active_poisition, sequence_only=False): { "url": "https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al", "description": "data source", - } + }, ], "num_points": len(df), # number of datapoints in this dataset "bibtex": [ - """@article{Jespersen2017, + """@article{Jespersen2017, doi = {10.1093/nar/gkx346}, url = {https://doi.org/10.1093/nar/gkx346}, year = {2017}, @@ -111,9 +115,8 @@ def get_active_position(seq, active_poisition, sequence_only=False): pages = {W24--W29}, author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, - journal = {Nucleic Acids Research}}""", - - """@article{Berman2000, + journal = {Nucleic Acids Research}}""", + """@article{Berman2000, doi = {10.1093/nar/28.1.235}, url = {https://doi.org/10.1093/nar/28.1.235}, year = {2000}, @@ -124,11 +127,10 @@ def get_active_position(seq, active_poisition, sequence_only=False): pages = {235--242}, author = {H. M. Berman}, title = {The Protein Data Bank}, - journal = {Nucleic Acids Research}}""", - + journal = {Nucleic Acids Research}}""", ], } - + def str_presenter(dumper, data): """configures yaml for dumping multiline strings Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data @@ -147,5 +149,6 @@ def str_presenter(dumper, data): print(f"Finished processing {meta['name']} dataset!") + if __name__ == "__main__": get_and_transform_data() From 334a832d8f59cc8cb12ae93ed050948a02cf6ed4 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 11 Mar 2023 16:33:12 +0200 Subject: [PATCH 03/13] Update transform.py --- data/PDB_Jespersen_et_al/transform.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/PDB_Jespersen_et_al/transform.py b/data/PDB_Jespersen_et_al/transform.py index 128ec11b5..a31702593 100644 --- a/data/PDB_Jespersen_et_al/transform.py +++ b/data/PDB_Jespersen_et_al/transform.py @@ -64,7 +64,9 @@ def get_active_position(seq, active_poisition, sequence_only=False): meta = { "name": f"{target_folder}", # unique identifier, we will also use this for directory names - "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from PDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""", + "description": """Epitope prediction is to predict the active region in the antigen. + This dataset is from Bepipred, which curates a dataset from PDB. + It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""", "targets": [ { "id": "active_position", # name of the column in a tabular dataset From 3bb5d0e4e795c66d5e5d05fead5b824831548ae6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 11 Mar 2023 14:33:19 +0000 Subject: [PATCH 04/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/PDB_Jespersen_et_al/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/PDB_Jespersen_et_al/transform.py b/data/PDB_Jespersen_et_al/transform.py index a31702593..dda7f298b 100644 --- a/data/PDB_Jespersen_et_al/transform.py +++ b/data/PDB_Jespersen_et_al/transform.py @@ -64,7 +64,7 @@ def get_active_position(seq, active_poisition, sequence_only=False): meta = { "name": f"{target_folder}", # unique identifier, we will also use this for directory names - "description": """Epitope prediction is to predict the active region in the antigen. + "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from PDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""", "targets": [ From ac9dd66d1e67df0adce220225211426d3d9e74f5 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 11 Mar 2023 16:37:23 +0200 Subject: [PATCH 05/13] Update transform.py --- data/PDB_Jespersen_et_al/transform.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data/PDB_Jespersen_et_al/transform.py b/data/PDB_Jespersen_et_al/transform.py index dda7f298b..ec27020e0 100644 --- a/data/PDB_Jespersen_et_al/transform.py +++ b/data/PDB_Jespersen_et_al/transform.py @@ -70,15 +70,15 @@ def get_active_position(seq, active_poisition, sequence_only=False): "targets": [ { "id": "active_position", # name of the column in a tabular dataset - "description": "", # description of what this column means - "units": "amino acids sequence position that is active in binding", # units of the values in this column (leave empty if unitless) + "description": "amino acids sequence position that is active in binding", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) "type": "Other", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) "amino acids sequence active in binding", "Epitope", ], "uris": [ - "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189", + "https://rb.gy/l1st1c", ], } ], From 85e95a2eef60764fdc0dc24f33e655be2ef329b2 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Mon, 27 Mar 2023 19:37:23 +0200 Subject: [PATCH 06/13] Delete data/IEDB_Jespersen_et_al directory --- data/IEDB_Jespersen_et_al/meta.yaml | 54 --------- data/IEDB_Jespersen_et_al/transform.py | 154 ------------------------- 2 files changed, 208 deletions(-) delete mode 100644 data/IEDB_Jespersen_et_al/meta.yaml delete mode 100644 data/IEDB_Jespersen_et_al/transform.py diff --git a/data/IEDB_Jespersen_et_al/meta.yaml b/data/IEDB_Jespersen_et_al/meta.yaml deleted file mode 100644 index 70430fb05..000000000 --- a/data/IEDB_Jespersen_et_al/meta.yaml +++ /dev/null @@ -1,54 +0,0 @@ ---- -name: IEDB_Jespersen_et_al -description: Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from IEDB. It collects - B-cell epitopes and non-epitope amino acids determined from crystal structures. -targets: - - id: active_position - description: amino acids sequence position that is active in binding - units: '' - type: Other - names: - - amino acids sequence active in binding - - Epitope - uris: - - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189 -identifiers: - - id: Antigen_sequence - type: Other - description: amino acid sequence -license: CC BY 4.0 -links: - - url: https://doi.org/10.1093/nar/gky1006 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gkx346 - description: corresponding publication - - url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al - description: data source -num_points: 3159 -bibtex: - - |- - @article{Vita2018, - doi = {10.1093/nar/gky1006}, - url = {https://doi.org/10.1093/nar/gky1006}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D339--D343}, - author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters}, - title = {The Immune Epitope Database ({IEDB}): 2018 update}, - journal = {Nucleic Acids Research}} - - |- - @article{Jespersen2017, - doi = {10.1093/nar/gkx346}, - url = {https://doi.org/10.1093/nar/gkx346}, - year = {2017}, - month = may, - publisher = {Oxford University Press ({OUP})}, - volume = {45}, - number = {W1}, - pages = {W24--W29}, - author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, - title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, - journal = {Nucleic Acids Research}} diff --git a/data/IEDB_Jespersen_et_al/transform.py b/data/IEDB_Jespersen_et_al/transform.py deleted file mode 100644 index 9dfa986b3..000000000 --- a/data/IEDB_Jespersen_et_al/transform.py +++ /dev/null @@ -1,154 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import Epitope - - -def get_and_transform_data(): - # get raw data - target_folder = "IEDB_Jespersen_et_al" - target_subfolder = "IEDB_Jespersen" - data = Epitope(name=target_subfolder) - - def get_active_position(seq, active_poisition, sequence_only=False): - """ - Input: given a sequence and list of active index - Output: return active sequence and other sequence convert to _ - MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_ - """ - if sequence_only: - _seq = "".join([seq[x] for x in active_poisition]) - return _seq - _seq = ["_" for a in range(len(seq))] - for x in active_poisition: - _seq[x] = seq[x] - _seq = "".join(_seq) - return _seq - - df = pd.read_pickle("data/iedb_jespersen.pkl") - fields_orig = df.columns.tolist() - assert fields_orig == ["ID", "X", "Y"] - - # Rename columns of raw data - fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"] - df.columns = fields_clean - - # get active position - antigen_seq = df.Antigen_sequence.tolist() - a_pos_ind_list = df.active_positions_indices.tolist() - df["active_position"] = [ - get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list) - ] - - # save data to original - fn_data_original = "data_original.csv" - df.to_csv(fn_data_original, index=None) - df = pd.read_csv(fn_data_original, sep=",") - fields_orig = df.columns.tolist() - assert fields_orig == [ - "Antigen_ID", - "Antigen_sequence", - "active_positions_indices", - "active_position", - ] - - # get right columns - - df = df[["Antigen_sequence", "active_position"]] - fields_clean = ["Antigen_sequence", "active_position"] - df.columns = fields_clean - assert fields_orig != fields_clean - assert not df.duplicated().sum() - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - meta = { - "name": f"{target_folder}", # unique identifier, we will also use this for directory names - "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""", - "targets": [ - { - "id": "active_position", # name of the column in a tabular dataset - "description": "amino acids sequence position that is active in binding", # description of what this column means - "units": "", # units of the values in this column (leave empty if unitless) - "type": "Other", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "amino acids sequence active in binding", - "Epitope", - ], - "uris": [ - "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189", - ], - } - ], - "identifiers": [ - { - "id": "Antigen_sequence", # column name - "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "amino acid sequence", # description (optional, except for "Other") - } - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) - { - "url": "https://doi.org/10.1093/nar/gky1006", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gkx346", - "description": "corresponding publication", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al", - "description": "data source", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Vita2018, - doi = {10.1093/nar/gky1006}, - url = {https://doi.org/10.1093/nar/gky1006}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D339--D343}, - author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters}, - title = {The Immune Epitope Database ({IEDB}): 2018 update}, - journal = {Nucleic Acids Research}}""", - """@article{Jespersen2017, - doi = {10.1093/nar/gkx346}, - url = {https://doi.org/10.1093/nar/gkx346}, - year = {2017}, - month = may, - publisher = {Oxford University Press ({OUP})}, - volume = {45}, - number = {W1}, - pages = {W24--W29}, - author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, - title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, - journal = {Nucleic Acids Research}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() From bd64e79aa04d4c976dcf2bd3fc12bad6b90b9f27 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Mon, 27 Mar 2023 19:37:31 +0200 Subject: [PATCH 07/13] Delete data/PDB_Jespersen_et_al directory --- data/PDB_Jespersen_et_al/meta.yaml | 54 --------- data/PDB_Jespersen_et_al/transform.py | 156 -------------------------- 2 files changed, 210 deletions(-) delete mode 100644 data/PDB_Jespersen_et_al/meta.yaml delete mode 100644 data/PDB_Jespersen_et_al/transform.py diff --git a/data/PDB_Jespersen_et_al/meta.yaml b/data/PDB_Jespersen_et_al/meta.yaml deleted file mode 100644 index 4a0a78b54..000000000 --- a/data/PDB_Jespersen_et_al/meta.yaml +++ /dev/null @@ -1,54 +0,0 @@ ---- -name: PDB_Jespersen_et_al -description: Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from PDB. It collects - B-cell epitopes and non-epitope amino acids determined from crystal structures. -targets: - - id: active_position - description: '' - units: amino acids sequence position that is active in binding - type: Other - names: - - amino acids sequence active in binding - - Epitope - uris: - - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189 -identifiers: - - id: Antigen_sequence - type: Other - description: amino acid sequence -license: CC BY 4.0 -links: - - url: https://doi.org/10.1093/nar/gkx346 - description: corresponding publication - - url: https://doi.org/10.1093/nar/28.1.235 - description: corresponding publication - - url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al - description: data source -num_points: 447 -bibtex: - - |- - @article{Jespersen2017, - doi = {10.1093/nar/gkx346}, - url = {https://doi.org/10.1093/nar/gkx346}, - year = {2017}, - month = may, - publisher = {Oxford University Press ({OUP})}, - volume = {45}, - number = {W1}, - pages = {W24--W29}, - author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, - title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, - journal = {Nucleic Acids Research}} - - |- - @article{Berman2000, - doi = {10.1093/nar/28.1.235}, - url = {https://doi.org/10.1093/nar/28.1.235}, - year = {2000}, - month = jan, - publisher = {Oxford University Press ({OUP})}, - volume = {28}, - number = {1}, - pages = {235--242}, - author = {H. M. Berman}, - title = {The Protein Data Bank}, - journal = {Nucleic Acids Research}} diff --git a/data/PDB_Jespersen_et_al/transform.py b/data/PDB_Jespersen_et_al/transform.py deleted file mode 100644 index ec27020e0..000000000 --- a/data/PDB_Jespersen_et_al/transform.py +++ /dev/null @@ -1,156 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import Epitope - - -def get_and_transform_data(): - # get raw data - target_folder = "PDB_Jespersen_et_al" - target_subfolder = "PDB_Jespersen" - data = Epitope(name=target_subfolder) - - def get_active_position(seq, active_poisition, sequence_only=False): - """ - Input: given a sequence and list of active index - Output: return active sequence and other sequence convert to _ - MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_ - """ - if sequence_only: - _seq = "".join([seq[x] for x in active_poisition]) - return _seq - _seq = ["_" for a in range(len(seq))] - for x in active_poisition: - _seq[x] = seq[x] - _seq = "".join(_seq) - return _seq - - df = pd.read_pickle("data/pdb_jespersen.pkl") - fields_orig = df.columns.tolist() - assert fields_orig == ["ID", "X", "Y"] - - # Rename columns of raw data - fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"] - df.columns = fields_clean - - # get active position - antigen_seq = df.Antigen_sequence.tolist() - a_pos_ind_list = df.active_positions_indices.tolist() - df["active_position"] = [ - get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list) - ] - - # save data to original - fn_data_original = "data_original.csv" - df.to_csv(fn_data_original, index=None) - df = pd.read_csv(fn_data_original, sep=",") - fields_orig = df.columns.tolist() - assert fields_orig == [ - "Antigen_ID", - "Antigen_sequence", - "active_positions_indices", - "active_position", - ] - - # get right columns - - df = df[["Antigen_sequence", "active_position"]] - fields_clean = ["Antigen_sequence", "active_position"] - df.columns = fields_clean - assert fields_orig != fields_clean - assert not df.duplicated().sum() - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - meta = { - "name": f"{target_folder}", # unique identifier, we will also use this for directory names - "description": """Epitope prediction is to predict the active region in the antigen. - This dataset is from Bepipred, which curates a dataset from PDB. - It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""", - "targets": [ - { - "id": "active_position", # name of the column in a tabular dataset - "description": "amino acids sequence position that is active in binding", # description of what this column means - "units": "", # units of the values in this column (leave empty if unitless) - "type": "Other", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "amino acids sequence active in binding", - "Epitope", - ], - "uris": [ - "https://rb.gy/l1st1c", - ], - } - ], - "identifiers": [ - { - "id": "Antigen_sequence", # column name - "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "amino acid sequence", # description (optional, except for "Other") - } - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) - { - "url": "https://doi.org/10.1093/nar/gkx346", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/28.1.235", - "description": "corresponding publication", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al", - "description": "data source", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Jespersen2017, - doi = {10.1093/nar/gkx346}, - url = {https://doi.org/10.1093/nar/gkx346}, - year = {2017}, - month = may, - publisher = {Oxford University Press ({OUP})}, - volume = {45}, - number = {W1}, - pages = {W24--W29}, - author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, - title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes}, - journal = {Nucleic Acids Research}}""", - """@article{Berman2000, - doi = {10.1093/nar/28.1.235}, - url = {https://doi.org/10.1093/nar/28.1.235}, - year = {2000}, - month = jan, - publisher = {Oxford University Press ({OUP})}, - volume = {28}, - number = {1}, - pages = {235--242}, - author = {H. M. Berman}, - title = {The Protein Data Bank}, - journal = {Nucleic Acids Research}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() From 57e301c64dfa0a8ce0c1c3a426d46518bd5e0175 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Mon, 27 Mar 2023 19:37:49 +0200 Subject: [PATCH 08/13] Add files via upload --- data/iedb_jespersen_et_al/meta.yaml | 67 ++++++++++ data/iedb_jespersen_et_al/transform.py | 172 +++++++++++++++++++++++++ data/pdb_jespersen_et_al/meta.yaml | 65 ++++++++++ data/pdb_jespersen_et_al/transform.py | 170 ++++++++++++++++++++++++ 4 files changed, 474 insertions(+) create mode 100644 data/iedb_jespersen_et_al/meta.yaml create mode 100644 data/iedb_jespersen_et_al/transform.py create mode 100644 data/pdb_jespersen_et_al/meta.yaml create mode 100644 data/pdb_jespersen_et_al/transform.py diff --git a/data/iedb_jespersen_et_al/meta.yaml b/data/iedb_jespersen_et_al/meta.yaml new file mode 100644 index 000000000..282cc264b --- /dev/null +++ b/data/iedb_jespersen_et_al/meta.yaml @@ -0,0 +1,67 @@ +name: iedb_jespersen_et_al +description: |- + Epitope prediction is to predict the active region in the antigen. + This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell + epitopes and non-epitope amino acids determined from crystal structures. +targets: +- id: active_position + description: amino acids sequence position that is active in binding + units: '' + type: categorical + names: + - amino acids sequence active in binding + - Epitope + uris: + - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189 +benchmarks: +- name: TDC + link: https://tdcommons.ai/ + split_column: split +identifiers: +- id: Antigen_sequence + type: Other + names: + - amino acid sequence + - FASTQ + - fastq sequence + - Protien sequence + description: amino acid sequence +license: CC BY 4.0 +links: +- url: https://doi.org/10.1093/nar/gky1006 + description: corresponding publication +- url: https://doi.org/10.1093/nar/gkx346 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al + description: data source +num_points: 3159 +bibtex: +- |- + @article{Vita2018, + doi = {10.1093/nar/gky1006}, + url = {https://doi.org/10.1093/nar/gky1006}, + year = {2018}, + month = oct, + publisher = {Oxford University Press (OUP)}, + volume = {47}, + number = {D1}, + pages = {D339--D343}}, + author = {Randi Vita and Swapnil Mahajan and James A Overton and + Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and + Daniel K Wheeler and Alessandro Sette and Bjoern Peters}, + title = {The Immune Epitope Database (IEDB): 2018 update}, + journal = {Nucleic Acids Research} +- |- + @article{Jespersen2017, + doi = {10.1093/nar/gkx346}, + url = {https://doi.org/10.1093/nar/gkx346}, + year = {2017}, + month = may, + publisher = {Oxford University Press (OUP)}, + volume = {45}, + number = {W1}, + pages = {W24--W29}, + author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, + title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using + conformational epitopes}, + journal = {Nucleic Acids Research} diff --git a/data/iedb_jespersen_et_al/transform.py b/data/iedb_jespersen_et_al/transform.py new file mode 100644 index 000000000..e9a0cd0df --- /dev/null +++ b/data/iedb_jespersen_et_al/transform.py @@ -0,0 +1,172 @@ +import pandas as pd +import yaml +from tdc.single_pred import Epitope + + +def get_and_transform_data(): + # get raw data + target_folder = "IEDB_Jespersen_et_al" + target_subfolder = "IEDB_Jespersen" + data = Epitope(name=target_subfolder) + + def get_active_position(seq, active_poisition, sequence_only=False): + """ + Input: given a sequence and list of active index + Output: return active sequence and other sequence convert to _ + MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_ + """ + if sequence_only: + _seq = "".join([seq[x] for x in active_poisition]) + return _seq + _seq = ["_" for a in range(len(seq))] + for x in active_poisition: + _seq[x] = seq[x] + _seq = "".join(_seq) + return _seq + + df = pd.read_pickle("data/iedb_jespersen.pkl") + fields_orig = df.columns.tolist() + assert fields_orig == ["ID", "X", "Y"] + + # Rename columns of raw data + fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"] + df.columns = fields_clean + + # get active position + antigen_seq = df.Antigen_sequence.tolist() + a_pos_ind_list = df.active_positions_indices.tolist() + df["active_position"] = [ + get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list) + ] + + # save data to original + fn_data_original = "data_original.csv" + df.to_csv(fn_data_original, index=None) + df = pd.read_csv(fn_data_original, sep=",") + fields_orig = df.columns.tolist() + assert fields_orig == [ + "Antigen_ID", + "Antigen_sequence", + "active_positions_indices", + "active_position", + ] + + # get right columns + + df = df[["Antigen_sequence", "active_position"]] + fields_clean = ["Antigen_sequence", "active_position"] + df.columns = fields_clean + assert fields_orig != fields_clean + assert not df.duplicated().sum() + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + meta = { + "name": "iedb_jespersen_et_al", # unique identifier, we will also use this for directory names + "description": """Epitope prediction is to predict the active region in the antigen. +This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell +epitopes and non-epitope amino acids determined from crystal structures.""", + "targets": [ + { + "id": "active_position", # name of the column in a tabular dataset + "description": "amino acids sequence position that is active in binding", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "categorical", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "amino acids sequence active in binding", + "Epitope", + ], + "uris": [ + "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189", + ], + } + ], + "benchmarks": [ + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, + ], + "identifiers": [ + { + "id": "Antigen_sequence", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "names": [ + "amino acid sequence", + "FASTQ", + "fastq sequence", + "Protien sequence" + ], + "description": "amino acid sequence", # description (optional, except for "Other") + } + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.1093/nar/gky1006", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gkx346", + "description": "corresponding publication", + }, + { + "url": "https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al", + "description": "data source", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{Vita2018, +doi = {10.1093/nar/gky1006}, +url = {https://doi.org/10.1093/nar/gky1006}, +year = {2018}, +month = oct, +publisher = {Oxford University Press (OUP)}, +volume = {47}, +number = {D1}, +pages = {D339--D343}}, +author = {Randi Vita and Swapnil Mahajan and James A Overton and +Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and +Daniel K Wheeler and Alessandro Sette and Bjoern Peters}, +title = {The Immune Epitope Database (IEDB): 2018 update}, +journal = {Nucleic Acids Research}""", + """@article{Jespersen2017, +doi = {10.1093/nar/gkx346}, +url = {https://doi.org/10.1093/nar/gkx346}, +year = {2017}, +month = may, +publisher = {Oxford University Press (OUP)}, +volume = {45}, +number = {W1}, +pages = {W24--W29}, +author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, +title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using +conformational epitopes}, +journal = {Nucleic Acids Research}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/pdb_jespersen_et_al/meta.yaml b/data/pdb_jespersen_et_al/meta.yaml new file mode 100644 index 000000000..4480baf47 --- /dev/null +++ b/data/pdb_jespersen_et_al/meta.yaml @@ -0,0 +1,65 @@ +name: pdb_jespersen_et_al +description: |- + Epitope prediction is to predict the active region in the antigen. + This dataset is from Bepipred, which curates a dataset from PDB. + It collects B-cell epitopes and non-epitope amino acids determined from crystal structures. +targets: +- id: active_position + description: amino acids sequence position that is active in binding + units: '' + type: categorical + names: + - amino acids sequence active in binding + - Epitope + uris: + - https://rb.gy/l1st1c +benchmarks: +- name: TDC + link: https://tdcommons.ai/ + split_column: split +identifiers: +- id: Antigen_sequence + type: Other + names: + - amino acid sequence + - FASTQ + - fastq sequence + - Protien sequence + description: amino acid sequence +license: CC BY 4.0 +links: +- url: https://doi.org/10.1093/nar/gkx346 + description: corresponding publication +- url: https://doi.org/10.1093/nar/28.1.235 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al + description: data source +num_points: 447 +bibtex: +- |- + @article{Jespersen2017, + doi = {10.1093/nar/gkx346}, + url = {https://doi.org/10.1093/nar/gkx346}, + year = {2017}, + month = may, + publisher = {Oxford University Press (OUP)}, + volume = {45}, + number = {W1}, + pages = {W24--W29}, + author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, + title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using + conformational epitopes}, + journal = {Nucleic Acids Research} +- |- + @article{Berman2000, + doi = {10.1093/nar/28.1.235}, + url = {https://doi.org/10.1093/nar/28.1.235}, + year = {2000}, + month = jan, + publisher = {Oxford University Press (OUP)}, + volume = {28}, + number = {1}, + pages = {235--242}, + author = {H. M. Berman}, + title = {The Protein Data Bank}, + journal = {Nucleic Acids Research} diff --git a/data/pdb_jespersen_et_al/transform.py b/data/pdb_jespersen_et_al/transform.py new file mode 100644 index 000000000..3e21c2cad --- /dev/null +++ b/data/pdb_jespersen_et_al/transform.py @@ -0,0 +1,170 @@ +import pandas as pd +import yaml +from tdc.single_pred import Epitope + + +def get_and_transform_data(): + # get raw data + target_folder = "PDB_Jespersen_et_al" + target_subfolder = "PDB_Jespersen" + data = Epitope(name=target_subfolder) + + def get_active_position(seq, active_poisition, sequence_only=False): + """ + Input: given a sequence and list of active index + Output: return active sequence and other sequence convert to _ + MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_ + """ + if sequence_only: + _seq = "".join([seq[x] for x in active_poisition]) + return _seq + _seq = ["_" for a in range(len(seq))] + for x in active_poisition: + _seq[x] = seq[x] + _seq = "".join(_seq) + return _seq + + df = pd.read_pickle("data/pdb_jespersen.pkl") + fields_orig = df.columns.tolist() + assert fields_orig == ["ID", "X", "Y"] + + # Rename columns of raw data + fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"] + df.columns = fields_clean + + # get active position + antigen_seq = df.Antigen_sequence.tolist() + a_pos_ind_list = df.active_positions_indices.tolist() + df["active_position"] = [ + get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list) + ] + + # save data to original + fn_data_original = "data_original.csv" + df.to_csv(fn_data_original, index=None) + df = pd.read_csv(fn_data_original, sep=",") + fields_orig = df.columns.tolist() + assert fields_orig == [ + "Antigen_ID", + "Antigen_sequence", + "active_positions_indices", + "active_position", + ] + + # get right columns + + df = df[["Antigen_sequence", "active_position"]] + fields_clean = ["Antigen_sequence", "active_position"] + df.columns = fields_clean + assert fields_orig != fields_clean + assert not df.duplicated().sum() + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + meta = { + "name": "pdb_jespersen_et_al", # unique identifier, we will also use this for directory names + "description": """Epitope prediction is to predict the active region in the antigen. +This dataset is from Bepipred, which curates a dataset from PDB. +It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""", + "targets": [ + { + "id": "active_position", # name of the column in a tabular dataset + "description": "amino acids sequence position that is active in binding", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "categorical", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "amino acids sequence active in binding", + "Epitope", + ], + "uris": [ + "https://rb.gy/l1st1c", + ], + } + ], + "benchmarks": [ + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, + ], + "identifiers": [ + { + "id": "Antigen_sequence", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "names": [ + "amino acid sequence", + "FASTQ", + "fastq sequence", + "Protien sequence" + ], + "description": "amino acid sequence", # description (optional, except for "Other") + } + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.1093/nar/gkx346", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/28.1.235", + "description": "corresponding publication", + }, + { + "url": "https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al", + "description": "data source", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{Jespersen2017, +doi = {10.1093/nar/gkx346}, +url = {https://doi.org/10.1093/nar/gkx346}, +year = {2017}, +month = may, +publisher = {Oxford University Press (OUP)}, +volume = {45}, +number = {W1}, +pages = {W24--W29}, +author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, +title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using +conformational epitopes}, +journal = {Nucleic Acids Research}""", + """@article{Berman2000, +doi = {10.1093/nar/28.1.235}, +url = {https://doi.org/10.1093/nar/28.1.235}, +year = {2000}, +month = jan, +publisher = {Oxford University Press (OUP)}, +volume = {28}, +number = {1}, +pages = {235--242}, +author = {H. M. Berman}, +title = {The Protein Data Bank}, +journal = {Nucleic Acids Research}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() From 56113d9794a40b3fd172ba9c0071501cca8dfc07 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Wed, 29 Mar 2023 03:23:10 +0200 Subject: [PATCH 09/13] Remove benchmark field As data was complex and get in an indirect way. I didn't implement split --- data/iedb_jespersen_et_al/meta.yaml | 4 ---- data/iedb_jespersen_et_al/transform.py | 13 +++---------- data/pdb_jespersen_et_al/meta.yaml | 4 ---- data/pdb_jespersen_et_al/transform.py | 7 ------- 4 files changed, 3 insertions(+), 25 deletions(-) diff --git a/data/iedb_jespersen_et_al/meta.yaml b/data/iedb_jespersen_et_al/meta.yaml index 282cc264b..6d577b3ad 100644 --- a/data/iedb_jespersen_et_al/meta.yaml +++ b/data/iedb_jespersen_et_al/meta.yaml @@ -13,10 +13,6 @@ targets: - Epitope uris: - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189 -benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split identifiers: - id: Antigen_sequence type: Other diff --git a/data/iedb_jespersen_et_al/transform.py b/data/iedb_jespersen_et_al/transform.py index e9a0cd0df..ff48d9568 100644 --- a/data/iedb_jespersen_et_al/transform.py +++ b/data/iedb_jespersen_et_al/transform.py @@ -63,14 +63,14 @@ def get_active_position(seq, active_poisition, sequence_only=False): df.to_csv(fn_data_csv, index=False) meta = { - "name": "iedb_jespersen_et_al", # unique identifier, we will also use this for directory names + "name": "iedb_jespersen_et_al", "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""", "targets": [ { "id": "active_position", # name of the column in a tabular dataset - "description": "amino acids sequence position that is active in binding", # description of what this column means + "description": "amino acids sequence position that is active in binding", "units": "", # units of the values in this column (leave empty if unitless) "type": "categorical", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) @@ -82,13 +82,6 @@ def get_active_position(seq, active_poisition, sequence_only=False): ], } ], - "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, - ], "identifiers": [ { "id": "Antigen_sequence", # column name @@ -99,7 +92,7 @@ def get_active_position(seq, active_poisition, sequence_only=False): "fastq sequence", "Protien sequence" ], - "description": "amino acid sequence", # description (optional, except for "Other") + "description": "amino acid sequence", # d } ], "license": "CC BY 4.0", # license under which the original dataset was published diff --git a/data/pdb_jespersen_et_al/meta.yaml b/data/pdb_jespersen_et_al/meta.yaml index 4480baf47..48701c82d 100644 --- a/data/pdb_jespersen_et_al/meta.yaml +++ b/data/pdb_jespersen_et_al/meta.yaml @@ -13,10 +13,6 @@ targets: - Epitope uris: - https://rb.gy/l1st1c -benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split identifiers: - id: Antigen_sequence type: Other diff --git a/data/pdb_jespersen_et_al/transform.py b/data/pdb_jespersen_et_al/transform.py index 3e21c2cad..7a4a77fb3 100644 --- a/data/pdb_jespersen_et_al/transform.py +++ b/data/pdb_jespersen_et_al/transform.py @@ -82,13 +82,6 @@ def get_active_position(seq, active_poisition, sequence_only=False): ], } ], - "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, - ], "identifiers": [ { "id": "Antigen_sequence", # column name From dd81fc743e36f4606d9e2ba92e66a7971595c769 Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Wed, 19 Apr 2023 14:31:42 +0200 Subject: [PATCH 10/13] feat: iedb_jespersen_et_al clean up --- data/iedb_jespersen_et_al/meta.yaml | 113 +++++++++++++------------ data/iedb_jespersen_et_al/transform.py | 62 ++++++++++---- 2 files changed, 102 insertions(+), 73 deletions(-) diff --git a/data/iedb_jespersen_et_al/meta.yaml b/data/iedb_jespersen_et_al/meta.yaml index 6d577b3ad..ca352405c 100644 --- a/data/iedb_jespersen_et_al/meta.yaml +++ b/data/iedb_jespersen_et_al/meta.yaml @@ -1,63 +1,66 @@ +--- name: iedb_jespersen_et_al description: |- - Epitope prediction is to predict the active region in the antigen. - This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell - epitopes and non-epitope amino acids determined from crystal structures. + Epitope prediction is to predict the active region in the antigen. + This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell + epitopes and non-epitope amino acids determined from crystal structures. targets: -- id: active_position - description: amino acids sequence position that is active in binding - units: '' - type: categorical - names: - - amino acids sequence active in binding - - Epitope - uris: - - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189 + - id: active_position + description: amino acids sequence position that is active in binding + units: + type: categorical + names: + - epitope + - amino acids sequence active in antigen binding + - epitope sequence active in antigen binding + - epitope sequence active in binding + uris: + - http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189 identifiers: -- id: Antigen_sequence - type: Other - names: - - amino acid sequence - - FASTQ - - fastq sequence - - Protien sequence - description: amino acid sequence + - id: Antigen_sequence + type: Other + names: + - amino acid sequence + - AA sequence + - epitope amino acid sequence + - epitope AA sequence + description: amino acid sequence license: CC BY 4.0 links: -- url: https://doi.org/10.1093/nar/gky1006 - description: corresponding publication -- url: https://doi.org/10.1093/nar/gkx346 - description: corresponding publication -- url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al - description: data source + - url: https://doi.org/10.1093/nar/gky1006 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gkx346 + description: corresponding publication + - url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al + description: data source num_points: 3159 bibtex: -- |- - @article{Vita2018, - doi = {10.1093/nar/gky1006}, - url = {https://doi.org/10.1093/nar/gky1006}, - year = {2018}, - month = oct, - publisher = {Oxford University Press (OUP)}, - volume = {47}, - number = {D1}, - pages = {D339--D343}}, - author = {Randi Vita and Swapnil Mahajan and James A Overton and - Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and - Daniel K Wheeler and Alessandro Sette and Bjoern Peters}, - title = {The Immune Epitope Database (IEDB): 2018 update}, - journal = {Nucleic Acids Research} -- |- - @article{Jespersen2017, - doi = {10.1093/nar/gkx346}, - url = {https://doi.org/10.1093/nar/gkx346}, - year = {2017}, - month = may, - publisher = {Oxford University Press (OUP)}, - volume = {45}, - number = {W1}, - pages = {W24--W29}, - author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, - title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using - conformational epitopes}, - journal = {Nucleic Acids Research} + - |- + @article{Vita2018, + doi = {10.1093/nar/gky1006}, + url = {https://doi.org/10.1093/nar/gky1006}, + year = {2018}, + month = oct, + publisher = {Oxford University Press (OUP)}, + volume = {47}, + number = {D1}, + pages = {D339--D343}}, + author = {Randi Vita and Swapnil Mahajan and James A Overton and + Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and + Daniel K Wheeler and Alessandro Sette and Bjoern Peters}, + title = {The Immune Epitope Database (IEDB): 2018 update}, + journal = {Nucleic Acids Research} + - |- + @article{Jespersen2017, + doi = {10.1093/nar/gkx346}, + url = {https://doi.org/10.1093/nar/gkx346}, + year = {2017}, + month = may, + publisher = {Oxford University Press (OUP)}, + volume = {45}, + number = {W1}, + pages = {W24--W29}, + author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, + title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using + conformational epitopes}, + journal = {Nucleic Acids Research} diff --git a/data/iedb_jespersen_et_al/transform.py b/data/iedb_jespersen_et_al/transform.py index ff48d9568..fe545a66e 100644 --- a/data/iedb_jespersen_et_al/transform.py +++ b/data/iedb_jespersen_et_al/transform.py @@ -5,38 +5,61 @@ def get_and_transform_data(): # get raw data - target_folder = "IEDB_Jespersen_et_al" target_subfolder = "IEDB_Jespersen" - data = Epitope(name=target_subfolder) + splits = Epitope(name=target_subfolder).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + df = pd.concat([df_train, df_valid, df_test], axis=0) - def get_active_position(seq, active_poisition, sequence_only=False): + fn_data_raw = "data_raw.csv" + df.to_csv(fn_data_raw, index=False) + del df + + def get_active_position(seq, active_position, sequence_only=False): """ Input: given a sequence and list of active index Output: return active sequence and other sequence convert to _ MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_ """ + if isinstance( + active_position, str + ): # if list is casted to string after loading from raw csv data file. + active_position = [int(x) for x in active_position[1:-1].split(", ")] + if sequence_only: - _seq = "".join([seq[x] for x in active_poisition]) + _seq = "".join([seq[x] for x in active_position]) return _seq + _seq = ["_" for a in range(len(seq))] - for x in active_poisition: + for x in active_position: _seq[x] = seq[x] _seq = "".join(_seq) return _seq - df = pd.read_pickle("data/iedb_jespersen.pkl") + # proceed raw data + df = pd.read_csv(fn_data_raw, sep=",") fields_orig = df.columns.tolist() - assert fields_orig == ["ID", "X", "Y"] + assert fields_orig == ["Antigen_ID", "Antigen", "Y", "split"] # Rename columns of raw data - fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"] + fields_clean = [ + "Antigen_ID", + "Antigen_sequence", + "active_positions_indices", + "split", + ] df.columns = fields_clean # get active position antigen_seq = df.Antigen_sequence.tolist() a_pos_ind_list = df.active_positions_indices.tolist() df["active_position"] = [ - get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list) + get_active_position(x, o, sequence_only=True) + for x, o in zip(antigen_seq, a_pos_ind_list) ] # save data to original @@ -48,6 +71,7 @@ def get_active_position(seq, active_poisition, sequence_only=False): "Antigen_ID", "Antigen_sequence", "active_positions_indices", + "split", "active_position", ] @@ -63,7 +87,7 @@ def get_active_position(seq, active_poisition, sequence_only=False): df.to_csv(fn_data_csv, index=False) meta = { - "name": "iedb_jespersen_et_al", + "name": "iedb_jespersen_et_al", "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""", @@ -71,14 +95,16 @@ def get_active_position(seq, active_poisition, sequence_only=False): { "id": "active_position", # name of the column in a tabular dataset "description": "amino acids sequence position that is active in binding", - "units": "", # units of the values in this column (leave empty if unitless) + "units": None, # units of the values in this column (leave empty if unitless) "type": "categorical", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "amino acids sequence active in binding", - "Epitope", + "epitope", + "amino acids sequence active in antigen binding", + "epitope sequence active in antigen binding", + "epitope sequence active in binding", ], "uris": [ - "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189", + "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189", ], } ], @@ -87,10 +113,10 @@ def get_active_position(seq, active_poisition, sequence_only=False): "id": "Antigen_sequence", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" "names": [ - "amino acid sequence", - "FASTQ", - "fastq sequence", - "Protien sequence" + "amino acid sequence", + "AA sequence", + "epitope amino acid sequence", + "epitope AA sequence", ], "description": "amino acid sequence", # d } From 21f6ecf281c048cd25a7ec9626293c6d9e740114 Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Wed, 19 Apr 2023 14:43:25 +0200 Subject: [PATCH 11/13] feat: pdb_jespersen_et_al clean up --- data/pdb_jespersen_et_al/meta.yaml | 109 +++++++++++++------------- data/pdb_jespersen_et_al/transform.py | 62 ++++++++++----- 2 files changed, 100 insertions(+), 71 deletions(-) diff --git a/data/pdb_jespersen_et_al/meta.yaml b/data/pdb_jespersen_et_al/meta.yaml index 48701c82d..5fdcac996 100644 --- a/data/pdb_jespersen_et_al/meta.yaml +++ b/data/pdb_jespersen_et_al/meta.yaml @@ -1,61 +1,64 @@ +--- name: pdb_jespersen_et_al description: |- - Epitope prediction is to predict the active region in the antigen. - This dataset is from Bepipred, which curates a dataset from PDB. - It collects B-cell epitopes and non-epitope amino acids determined from crystal structures. + Epitope prediction is to predict the active region in the antigen. + This dataset is from Bepipred, which curates a dataset from PDB. + It collects B-cell epitopes and non-epitope amino acids determined from crystal structures. targets: -- id: active_position - description: amino acids sequence position that is active in binding - units: '' - type: categorical - names: - - amino acids sequence active in binding - - Epitope - uris: - - https://rb.gy/l1st1c + - id: active_position + description: amino acids sequence position that is active in binding + units: + type: categorical + names: + - epitope + - amino acids sequence active in antigen binding + - epitope sequence active in antigen binding + - epitope sequence active in binding + uris: + - http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189 identifiers: -- id: Antigen_sequence - type: Other - names: - - amino acid sequence - - FASTQ - - fastq sequence - - Protien sequence - description: amino acid sequence + - id: Antigen_sequence + type: Other + names: + - amino acid sequence + - AA sequence + - epitope amino acid sequence + - epitope AA sequence + description: amino acid sequence license: CC BY 4.0 links: -- url: https://doi.org/10.1093/nar/gkx346 - description: corresponding publication -- url: https://doi.org/10.1093/nar/28.1.235 - description: corresponding publication -- url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al - description: data source + - url: https://doi.org/10.1093/nar/gkx346 + description: corresponding publication + - url: https://doi.org/10.1093/nar/28.1.235 + description: corresponding publication + - url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al + description: data source num_points: 447 bibtex: -- |- - @article{Jespersen2017, - doi = {10.1093/nar/gkx346}, - url = {https://doi.org/10.1093/nar/gkx346}, - year = {2017}, - month = may, - publisher = {Oxford University Press (OUP)}, - volume = {45}, - number = {W1}, - pages = {W24--W29}, - author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, - title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using - conformational epitopes}, - journal = {Nucleic Acids Research} -- |- - @article{Berman2000, - doi = {10.1093/nar/28.1.235}, - url = {https://doi.org/10.1093/nar/28.1.235}, - year = {2000}, - month = jan, - publisher = {Oxford University Press (OUP)}, - volume = {28}, - number = {1}, - pages = {235--242}, - author = {H. M. Berman}, - title = {The Protein Data Bank}, - journal = {Nucleic Acids Research} + - |- + @article{Jespersen2017, + doi = {10.1093/nar/gkx346}, + url = {https://doi.org/10.1093/nar/gkx346}, + year = {2017}, + month = may, + publisher = {Oxford University Press (OUP)}, + volume = {45}, + number = {W1}, + pages = {W24--W29}, + author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili}, + title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using + conformational epitopes}, + journal = {Nucleic Acids Research} + - |- + @article{Berman2000, + doi = {10.1093/nar/28.1.235}, + url = {https://doi.org/10.1093/nar/28.1.235}, + year = {2000}, + month = jan, + publisher = {Oxford University Press (OUP)}, + volume = {28}, + number = {1}, + pages = {235--242}, + author = {H. M. Berman}, + title = {The Protein Data Bank}, + journal = {Nucleic Acids Research} diff --git a/data/pdb_jespersen_et_al/transform.py b/data/pdb_jespersen_et_al/transform.py index 7a4a77fb3..fd9a38747 100644 --- a/data/pdb_jespersen_et_al/transform.py +++ b/data/pdb_jespersen_et_al/transform.py @@ -5,38 +5,61 @@ def get_and_transform_data(): # get raw data - target_folder = "PDB_Jespersen_et_al" target_subfolder = "PDB_Jespersen" - data = Epitope(name=target_subfolder) + splits = Epitope(name=target_subfolder).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + df = pd.concat([df_train, df_valid, df_test], axis=0) - def get_active_position(seq, active_poisition, sequence_only=False): + fn_data_raw = "data_raw.csv" + df.to_csv(fn_data_raw, index=False) + del df + + def get_active_position(seq, active_position, sequence_only=False): """ Input: given a sequence and list of active index Output: return active sequence and other sequence convert to _ MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_ """ + if isinstance( + active_position, str + ): # if list is casted to string after loading from raw csv data file. + active_position = [int(x) for x in active_position[1:-1].split(", ")] + if sequence_only: - _seq = "".join([seq[x] for x in active_poisition]) + _seq = "".join([seq[x] for x in active_position]) return _seq + _seq = ["_" for a in range(len(seq))] - for x in active_poisition: + for x in active_position: _seq[x] = seq[x] _seq = "".join(_seq) return _seq - df = pd.read_pickle("data/pdb_jespersen.pkl") + # proceed raw data + df = pd.read_csv(fn_data_raw, sep=",") fields_orig = df.columns.tolist() - assert fields_orig == ["ID", "X", "Y"] + assert fields_orig == ["Antigen_ID", "Antigen", "Y", "split"] # Rename columns of raw data - fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"] + fields_clean = [ + "Antigen_ID", + "Antigen_sequence", + "active_positions_indices", + "split", + ] df.columns = fields_clean # get active position antigen_seq = df.Antigen_sequence.tolist() a_pos_ind_list = df.active_positions_indices.tolist() df["active_position"] = [ - get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list) + get_active_position(x, o, sequence_only=True) + for x, o in zip(antigen_seq, a_pos_ind_list) ] # save data to original @@ -48,6 +71,7 @@ def get_active_position(seq, active_poisition, sequence_only=False): "Antigen_ID", "Antigen_sequence", "active_positions_indices", + "split", "active_position", ] @@ -70,15 +94,17 @@ def get_active_position(seq, active_poisition, sequence_only=False): "targets": [ { "id": "active_position", # name of the column in a tabular dataset - "description": "amino acids sequence position that is active in binding", # description of what this column means - "units": "", # units of the values in this column (leave empty if unitless) + "description": "amino acids sequence position that is active in binding", + "units": None, # units of the values in this column (leave empty if unitless) "type": "categorical", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "amino acids sequence active in binding", - "Epitope", + "epitope", + "amino acids sequence active in antigen binding", + "epitope sequence active in antigen binding", + "epitope sequence active in binding", ], "uris": [ - "https://rb.gy/l1st1c", + "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189", ], } ], @@ -87,10 +113,10 @@ def get_active_position(seq, active_poisition, sequence_only=False): "id": "Antigen_sequence", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" "names": [ - "amino acid sequence", - "FASTQ", - "fastq sequence", - "Protien sequence" + "amino acid sequence", + "AA sequence", + "epitope amino acid sequence", + "epitope AA sequence", ], "description": "amino acid sequence", # description (optional, except for "Other") } From e47fc44e137065b792abe4528e7671ead9f107e3 Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Thu, 27 Apr 2023 14:12:38 +0200 Subject: [PATCH 12/13] feat: update new names setup for iedb_jespersen_et_al --- data/iedb_jespersen_et_al/meta.yaml | 8 ++++---- data/iedb_jespersen_et_al/transform.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/data/iedb_jespersen_et_al/meta.yaml b/data/iedb_jespersen_et_al/meta.yaml index ca352405c..72638e1fa 100644 --- a/data/iedb_jespersen_et_al/meta.yaml +++ b/data/iedb_jespersen_et_al/meta.yaml @@ -10,10 +10,10 @@ targets: units: type: categorical names: - - epitope - - amino acids sequence active in antigen binding - - epitope sequence active in antigen binding - - epitope sequence active in binding + - noun: epitope + - noun: amino acids sequence active in the antigen binding + - noun: epitope sequence active in the antigen binding + - noun: epitope sequence active in the binding uris: - http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189 identifiers: diff --git a/data/iedb_jespersen_et_al/transform.py b/data/iedb_jespersen_et_al/transform.py index fe545a66e..22472ba74 100644 --- a/data/iedb_jespersen_et_al/transform.py +++ b/data/iedb_jespersen_et_al/transform.py @@ -98,10 +98,10 @@ def get_active_position(seq, active_position, sequence_only=False): "units": None, # units of the values in this column (leave empty if unitless) "type": "categorical", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "epitope", - "amino acids sequence active in antigen binding", - "epitope sequence active in antigen binding", - "epitope sequence active in binding", + {"noun": "epitope"}, + {"noun": "amino acids sequence active in the antigen binding"}, + {"noun": "epitope sequence active in the antigen binding"}, + {"noun": "epitope sequence active in the binding"}, ], "uris": [ "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189", From 714ee93a1acc419e0913f498f6dbb1d31b7a11a2 Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Thu, 27 Apr 2023 14:13:35 +0200 Subject: [PATCH 13/13] feat: update new names setup for pdb_jespersen_et_al --- data/pdb_jespersen_et_al/meta.yaml | 8 ++++---- data/pdb_jespersen_et_al/transform.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/data/pdb_jespersen_et_al/meta.yaml b/data/pdb_jespersen_et_al/meta.yaml index 5fdcac996..61df1008b 100644 --- a/data/pdb_jespersen_et_al/meta.yaml +++ b/data/pdb_jespersen_et_al/meta.yaml @@ -10,10 +10,10 @@ targets: units: type: categorical names: - - epitope - - amino acids sequence active in antigen binding - - epitope sequence active in antigen binding - - epitope sequence active in binding + - noun: epitope + - noun: amino acids sequence active in antigen binding + - noun: epitope sequence active in antigen binding + - noun: epitope sequence active in binding uris: - http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189 identifiers: diff --git a/data/pdb_jespersen_et_al/transform.py b/data/pdb_jespersen_et_al/transform.py index fd9a38747..c399f34cd 100644 --- a/data/pdb_jespersen_et_al/transform.py +++ b/data/pdb_jespersen_et_al/transform.py @@ -98,10 +98,10 @@ def get_active_position(seq, active_position, sequence_only=False): "units": None, # units of the values in this column (leave empty if unitless) "type": "categorical", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "epitope", - "amino acids sequence active in antigen binding", - "epitope sequence active in antigen binding", - "epitope sequence active in binding", + {"noun": "epitope"}, + {"noun": "amino acids sequence active in antigen binding"}, + {"noun": "epitope sequence active in antigen binding"}, + {"noun": "epitope sequence active in binding"}, ], "uris": [ "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189",