From 6c146970e229e97c5266c2dc0c073387c54953f9 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Sat, 11 Mar 2023 12:34:34 +0200
Subject: [PATCH 01/13] Add Epitope data from TDC

This adds the Epitope datasets from TDC, available here:
https://tdcommons.ai/single_pred_tasks/epitope/

I need help validating my approach; in particular, we need to confirm whether the epitope (active binding) position indices are 0-based or 1-based.
---
 data/IEDB_Jespersen_et_al/meta.yaml    |  54 +++++++++
 data/IEDB_Jespersen_et_al/transform.py | 151 +++++++++++++++++++++++++
 data/PDB_Jespersen_et_al/meta.yaml     |  54 +++++++++
 data/PDB_Jespersen_et_al/transform.py  | 151 +++++++++++++++++++++++++
 4 files changed, 410 insertions(+)
 create mode 100644 data/IEDB_Jespersen_et_al/meta.yaml
 create mode 100644 data/IEDB_Jespersen_et_al/transform.py
 create mode 100644 data/PDB_Jespersen_et_al/meta.yaml
 create mode 100644 data/PDB_Jespersen_et_al/transform.py

diff --git a/data/IEDB_Jespersen_et_al/meta.yaml b/data/IEDB_Jespersen_et_al/meta.yaml
new file mode 100644
index 000000000..9d39d1c3f
--- /dev/null
+++ b/data/IEDB_Jespersen_et_al/meta.yaml
@@ -0,0 +1,54 @@
+name: IEDB_Jespersen_et_al
+description: Epitope prediction is to predict the active region in the antigen. This
+  dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell
+  epitopes and non-epitope amino acids determined from crystal structures.
+targets:
+- id: active_position
+  description: amino acids sequence position that is active in binding
+  units: ''
+  type: Other
+  names:
+  - amino acids sequence active in binding
+  - Epitope
+  uris:
+  - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189
+identifiers:
+- id: Antigen_sequence
+  type: Other
+  description: amino acid sequence
+license: CC BY 4.0
+links:
+- url: https://doi.org/10.1093/nar/gky1006
+  description: corresponding publication
+- url: https://doi.org/10.1093/nar/gkx346
+  description: corresponding publication
+- url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al
+  description: data source
+num_points: 3159
+bibtex:
+- |-
+  @article{Vita2018,
+                doi = {10.1093/nar/gky1006},
+                url = {https://doi.org/10.1093/nar/gky1006},
+                year = {2018},
+                month = oct,
+                publisher = {Oxford University Press ({OUP})},
+                volume = {47},
+                number = {D1},
+                pages = {D339--D343},
+                author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
+                title = {The Immune Epitope Database ({IEDB}): 2018 update},
+                journal = {Nucleic Acids Research}}
+- |-
+  @article{Jespersen2017,
+                doi = {10.1093/nar/gkx346},
+                url = {https://doi.org/10.1093/nar/gkx346},
+                year = {2017},
+                month = may,
+                publisher = {Oxford University Press ({OUP})},
+                volume = {45},
+                number = {W1},
+                pages = {W24--W29},
+                author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
+                title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
+                journal = {Nucleic Acids Research}}
diff --git a/data/IEDB_Jespersen_et_al/transform.py b/data/IEDB_Jespersen_et_al/transform.py
new file mode 100644
index 000000000..0308c54f5
--- /dev/null
+++ b/data/IEDB_Jespersen_et_al/transform.py
@@ -0,0 +1,151 @@
+import pandas as pd
+import yaml
+from tdc.single_pred import Epitope
+
+def get_and_transform_data():
+    # get raw data
+    target_folder = 'IEDB_Jespersen_et_al'
+    target_subfolder = 'IEDB_Jespersen'
+    data = Epitope(name = target_subfolder)
+    def get_active_position(seq, active_poisition, sequence_only=False):
+        '''
+        Input: given a sequence and list of active index 
+        Output: return active sequence and other sequence convert to _
+        MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_
+        '''
+        if sequence_only: 
+            _seq = ''.join([seq[x] for x in active_poisition])
+            return _seq
+        _seq = ['_' for a in range(len(seq))]
+        for x in active_poisition:
+            _seq[x] = seq[x]    
+        _seq = ''.join(_seq)
+        return _seq
+
+    df = pd.read_pickle('data/iedb_jespersen.pkl')
+    fields_orig = df.columns.tolist()
+    assert fields_orig == ['ID', 'X', 'Y']
+ 
+    
+    #Rename columns of raw data
+    fields_clean = ['Antigen_ID', 'Antigen_sequence', 'active_positions_indices']
+    df.columns = fields_clean
+    
+    #get active position
+    antigen_seq = df.Antigen_sequence.tolist()
+    a_pos_ind_list = df.active_positions_indices.tolist()
+    df['active_position'] = [get_active_position(x,o) for x,o in zip(antigen_seq, a_pos_ind_list)]
+    
+    # save data to original
+    fn_data_original = 'data_original.csv'
+    df.to_csv(fn_data_original,index=None)
+    df = pd.read_csv(fn_data_original, sep=',')
+    fields_orig = df.columns.tolist()
+    assert fields_orig == ['Antigen_ID',
+     'Antigen_sequence',
+     'active_positions_indices',
+     'active_position']
+    
+    # get right columns
+    
+    df = df[['Antigen_sequence', 'active_position']]
+    fields_clean = ['Antigen_sequence', 'active_position']
+    df.columns = fields_clean
+    assert fields_orig != fields_clean
+    assert not df.duplicated().sum()
+    # save to csv
+    fn_data_csv = "data_clean.csv"
+    df.to_csv(fn_data_csv, index=False)
+ 
+
+    meta = {
+        "name": f"{target_folder}",  # unique identifier, we will also use this for directory names
+        "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""",
+        "targets": [
+            {
+                "id": "active_position",  # name of the column in a tabular dataset
+                "description": "amino acids sequence position that is active in binding",  # description of what this column means
+                "units": "",  # units of the values in this column (leave empty if unitless)
+                "type": "Other",  # can be "categorical", "ordinal", "continuous"
+                "names": [  # names for the property (to sample from for building the prompts)
+                    "amino acids sequence active in binding",
+                    "Epitope"
+                ],
+                "uris":[
+                    "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189",
+                ],
+            }
+        ],
+        "identifiers": [
+            {
+                "id": "Antigen_sequence",  # column name
+                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
+                "description": "amino acid sequence",  # description (optional, except for "Other")
+            }
+        ],
+        "license": "CC BY 4.0",  # license under which the original dataset was published
+        "links": [  # list of relevant links (original dataset, other uses, etc.)
+            {
+                "url": "https://doi.org/10.1093/nar/gky1006",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://doi.org/10.1093/nar/gkx346",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al",
+                "description": "data source",
+            }
+        ],
+        "num_points": len(df),  # number of datapoints in this dataset
+        "bibtex": [
+            """@article{Vita2018,
+              doi = {10.1093/nar/gky1006},
+              url = {https://doi.org/10.1093/nar/gky1006},
+              year = {2018},
+              month = oct,
+              publisher = {Oxford University Press ({OUP})},
+              volume = {47},
+              number = {D1},
+              pages = {D339--D343},
+              author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
+              title = {The Immune Epitope Database ({IEDB}): 2018 update},
+              journal = {Nucleic Acids Research}}""", 
+
+            """@article{Jespersen2017,
+              doi = {10.1093/nar/gkx346},
+              url = {https://doi.org/10.1093/nar/gkx346},
+              year = {2017},
+              month = may,
+              publisher = {Oxford University Press ({OUP})},
+              volume = {45},
+              number = {W1},
+              pages = {W24--W29},
+              author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
+              title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
+              journal = {Nucleic Acids Research}}""", 
+
+        ],
+    }
+    
+    def str_presenter(dumper, data):
+        """configures yaml for dumping multiline strings
+        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+        """
+        if data.count("\n") > 0:  # check for multiline string
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+    yaml.add_representer(str, str_presenter)
+    yaml.representer.SafeRepresenter.add_representer(
+        str, str_presenter
+    )  # to use with safe_dum
+    fn_meta = "meta.yaml"
+    with open(fn_meta, "w") as f:
+        yaml.dump(meta, f, sort_keys=False)
+
+    print(f"Finished processing {meta['name']} dataset!")
+
+if __name__ == "__main__":
+    get_and_transform_data()
diff --git a/data/PDB_Jespersen_et_al/meta.yaml b/data/PDB_Jespersen_et_al/meta.yaml
new file mode 100644
index 000000000..d771ee0e4
--- /dev/null
+++ b/data/PDB_Jespersen_et_al/meta.yaml
@@ -0,0 +1,54 @@
+name: PDB_Jespersen_et_al
+description: Epitope prediction is to predict the active region in the antigen. This
+  dataset is from Bepipred, which curates a dataset from PDB. It collects B-cell epitopes
+  and non-epitope amino acids determined from crystal structures.
+targets:
+- id: active_position
+  description: ''
+  units: amino acids sequence position that is active in binding
+  type: Other
+  names:
+  - amino acids sequence active in binding
+  - Epitope
+  uris:
+  - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189
+identifiers:
+- id: Antigen_sequence
+  type: Other
+  description: amino acid sequence
+license: CC BY 4.0
+links:
+- url: https://doi.org/10.1093/nar/gkx346
+  description: corresponding publication
+- url: https://doi.org/10.1093/nar/28.1.235
+  description: corresponding publication
+- url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al
+  description: data source
+num_points: 447
+bibtex:
+- |-
+  @article{Jespersen2017,
+                doi = {10.1093/nar/gkx346},
+                url = {https://doi.org/10.1093/nar/gkx346},
+                year = {2017},
+                month = may,
+                publisher = {Oxford University Press ({OUP})},
+                volume = {45},
+                number = {W1},
+                pages = {W24--W29},
+                author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
+                title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
+                journal = {Nucleic Acids Research}}
+- |-
+  @article{Berman2000,
+                doi = {10.1093/nar/28.1.235},
+                url = {https://doi.org/10.1093/nar/28.1.235},
+                year = {2000},
+                month = jan,
+                publisher = {Oxford University Press ({OUP})},
+                volume = {28},
+                number = {1},
+                pages = {235--242},
+                author = {H. M. Berman},
+                title = {The Protein Data Bank},
+                journal = {Nucleic Acids Research}}
diff --git a/data/PDB_Jespersen_et_al/transform.py b/data/PDB_Jespersen_et_al/transform.py
new file mode 100644
index 000000000..afb592343
--- /dev/null
+++ b/data/PDB_Jespersen_et_al/transform.py
@@ -0,0 +1,151 @@
+import pandas as pd
+import yaml
+from tdc.single_pred import Epitope
+
+def get_and_transform_data():
+    # get raw data
+    target_folder = 'PDB_Jespersen_et_al'
+    target_subfolder = 'PDB_Jespersen'
+    data = Epitope(name = target_subfolder)
+    
+    def get_active_position(seq, active_poisition, sequence_only=False):
+        '''
+        Input: given a sequence and list of active index 
+        Output: return active sequence and other sequence convert to _
+        MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_
+        '''
+        if sequence_only: 
+            _seq = ''.join([seq[x] for x in active_poisition])
+            return _seq
+        _seq = ['_' for a in range(len(seq))]
+        for x in active_poisition:
+            _seq[x] = seq[x]    
+        _seq = ''.join(_seq)
+        return _seq
+
+    df = pd.read_pickle('data/pdb_jespersen.pkl')
+    fields_orig = df.columns.tolist()
+    assert fields_orig == ['ID', 'X', 'Y']
+    
+    #Rename columns of raw data
+    fields_clean = ['Antigen_ID', 'Antigen_sequence', 'active_positions_indices']
+    df.columns = fields_clean
+    
+    #get active position
+    antigen_seq = df.Antigen_sequence.tolist()
+    a_pos_ind_list = df.active_positions_indices.tolist()
+    df['active_position'] = [get_active_position(x,o) for x,o in zip(antigen_seq, a_pos_ind_list)]
+    
+    # save data to original
+    fn_data_original = 'data_original.csv'
+    df.to_csv(fn_data_original,index=None)
+    df = pd.read_csv(fn_data_original, sep=',')
+    fields_orig = df.columns.tolist()
+    assert fields_orig == ['Antigen_ID',
+     'Antigen_sequence',
+     'active_positions_indices',
+     'active_position']
+    
+    # get right columns
+    
+    df = df[['Antigen_sequence', 'active_position']]
+    fields_clean = ['Antigen_sequence', 'active_position']
+    df.columns = fields_clean
+    assert fields_orig != fields_clean
+    assert not df.duplicated().sum()
+    # save to csv
+    fn_data_csv = "data_clean.csv"
+    df.to_csv(fn_data_csv, index=False)
+    
+    meta = {
+        "name": f"{target_folder}",  # unique identifier, we will also use this for directory names
+        "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from PDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""",
+        "targets": [
+            {
+                "id": "active_position",  # name of the column in a tabular dataset
+                "description": "",  # description of what this column means
+                "units": "amino acids sequence position that is active in binding",  # units of the values in this column (leave empty if unitless)
+                "type": "Other",  # can be "categorical", "ordinal", "continuous"
+                "names": [  # names for the property (to sample from for building the prompts)
+                    "amino acids sequence active in binding",
+                    "Epitope"
+                ],
+                "uris":[
+                    "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189",
+                ],
+            }
+        ],
+
+        "identifiers": [
+            {
+                "id": "Antigen_sequence",  # column name
+                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
+                "description": "amino acid sequence",  # description (optional, except for "Other")
+            }
+        ],
+        "license": "CC BY 4.0",  # license under which the original dataset was published
+        "links": [  # list of relevant links (original dataset, other uses, etc.)
+            {
+                "url": "https://doi.org/10.1093/nar/gkx346",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://doi.org/10.1093/nar/28.1.235",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al",
+                "description": "data source",
+            }
+        ],
+        "num_points": len(df),  # number of datapoints in this dataset
+        "bibtex": [
+        """@article{Jespersen2017,
+              doi = {10.1093/nar/gkx346},
+              url = {https://doi.org/10.1093/nar/gkx346},
+              year = {2017},
+              month = may,
+              publisher = {Oxford University Press ({OUP})},
+              volume = {45},
+              number = {W1},
+              pages = {W24--W29},
+              author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
+              title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
+              journal = {Nucleic Acids Research}}""", 
+
+        """@article{Berman2000,
+              doi = {10.1093/nar/28.1.235},
+              url = {https://doi.org/10.1093/nar/28.1.235},
+              year = {2000},
+              month = jan,
+              publisher = {Oxford University Press ({OUP})},
+              volume = {28},
+              number = {1},
+              pages = {235--242},
+              author = {H. M. Berman},
+              title = {The Protein Data Bank},
+              journal = {Nucleic Acids Research}}""", 
+
+        ],
+    }
+    
+    def str_presenter(dumper, data):
+        """configures yaml for dumping multiline strings
+        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+        """
+        if data.count("\n") > 0:  # check for multiline string
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+    yaml.add_representer(str, str_presenter)
+    yaml.representer.SafeRepresenter.add_representer(
+        str, str_presenter
+    )  # to use with safe_dum
+    fn_meta = "meta.yaml"
+    with open(fn_meta, "w") as f:
+        yaml.dump(meta, f, sort_keys=False)
+
+    print(f"Finished processing {meta['name']} dataset!")
+
+if __name__ == "__main__":
+    get_and_transform_data()

From 33933b23012fd36dd2738df17190fd53c6fbbb84 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 11 Mar 2023 10:35:10 +0000
Subject: [PATCH 02/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 data/IEDB_Jespersen_et_al/meta.yaml    | 94 +++++++++++++-------------
 data/IEDB_Jespersen_et_al/transform.py | 85 ++++++++++++-----------
 data/PDB_Jespersen_et_al/meta.yaml     | 94 +++++++++++++-------------
 data/PDB_Jespersen_et_al/transform.py  | 91 +++++++++++++------------
 4 files changed, 185 insertions(+), 179 deletions(-)

diff --git a/data/IEDB_Jespersen_et_al/meta.yaml b/data/IEDB_Jespersen_et_al/meta.yaml
index 9d39d1c3f..70430fb05 100644
--- a/data/IEDB_Jespersen_et_al/meta.yaml
+++ b/data/IEDB_Jespersen_et_al/meta.yaml
@@ -1,54 +1,54 @@
+---
 name: IEDB_Jespersen_et_al
-description: Epitope prediction is to predict the active region in the antigen. This
-  dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell
-  epitopes and non-epitope amino acids determined from crystal structures.
+description: Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from IEDB. It collects
+    B-cell epitopes and non-epitope amino acids determined from crystal structures.
 targets:
-- id: active_position
-  description: amino acids sequence position that is active in binding
-  units: ''
-  type: Other
-  names:
-  - amino acids sequence active in binding
-  - Epitope
-  uris:
-  - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189
+    - id: active_position
+      description: amino acids sequence position that is active in binding
+      units: ''
+      type: Other
+      names:
+          - amino acids sequence active in binding
+          - Epitope
+      uris:
+          - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189
 identifiers:
-- id: Antigen_sequence
-  type: Other
-  description: amino acid sequence
+    - id: Antigen_sequence
+      type: Other
+      description: amino acid sequence
 license: CC BY 4.0
 links:
-- url: https://doi.org/10.1093/nar/gky1006
-  description: corresponding publication
-- url: https://doi.org/10.1093/nar/gkx346
-  description: corresponding publication
-- url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al
-  description: data source
+    - url: https://doi.org/10.1093/nar/gky1006
+      description: corresponding publication
+    - url: https://doi.org/10.1093/nar/gkx346
+      description: corresponding publication
+    - url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al
+      description: data source
 num_points: 3159
 bibtex:
-- |-
-  @article{Vita2018,
-                doi = {10.1093/nar/gky1006},
-                url = {https://doi.org/10.1093/nar/gky1006},
-                year = {2018},
-                month = oct,
-                publisher = {Oxford University Press ({OUP})},
-                volume = {47},
-                number = {D1},
-                pages = {D339--D343},
-                author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
-                title = {The Immune Epitope Database ({IEDB}): 2018 update},
-                journal = {Nucleic Acids Research}}
-- |-
-  @article{Jespersen2017,
-                doi = {10.1093/nar/gkx346},
-                url = {https://doi.org/10.1093/nar/gkx346},
-                year = {2017},
-                month = may,
-                publisher = {Oxford University Press ({OUP})},
-                volume = {45},
-                number = {W1},
-                pages = {W24--W29},
-                author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
-                title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
-                journal = {Nucleic Acids Research}}
+    - |-
+      @article{Vita2018,
+                    doi = {10.1093/nar/gky1006},
+                    url = {https://doi.org/10.1093/nar/gky1006},
+                    year = {2018},
+                    month = oct,
+                    publisher = {Oxford University Press ({OUP})},
+                    volume = {47},
+                    number = {D1},
+                    pages = {D339--D343},
+                    author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
+                    title = {The Immune Epitope Database ({IEDB}): 2018 update},
+                    journal = {Nucleic Acids Research}}
+    - |-
+      @article{Jespersen2017,
+                    doi = {10.1093/nar/gkx346},
+                    url = {https://doi.org/10.1093/nar/gkx346},
+                    year = {2017},
+                    month = may,
+                    publisher = {Oxford University Press ({OUP})},
+                    volume = {45},
+                    number = {W1},
+                    pages = {W24--W29},
+                    author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
+                    title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
+                    journal = {Nucleic Acids Research}}
diff --git a/data/IEDB_Jespersen_et_al/transform.py b/data/IEDB_Jespersen_et_al/transform.py
index 0308c54f5..9dfa986b3 100644
--- a/data/IEDB_Jespersen_et_al/transform.py
+++ b/data/IEDB_Jespersen_et_al/transform.py
@@ -2,61 +2,65 @@
 import yaml
 from tdc.single_pred import Epitope
 
+
 def get_and_transform_data():
     # get raw data
-    target_folder = 'IEDB_Jespersen_et_al'
-    target_subfolder = 'IEDB_Jespersen'
-    data = Epitope(name = target_subfolder)
+    target_folder = "IEDB_Jespersen_et_al"
+    target_subfolder = "IEDB_Jespersen"
+    data = Epitope(name=target_subfolder)
+
     def get_active_position(seq, active_poisition, sequence_only=False):
-        '''
-        Input: given a sequence and list of active index 
+        """
+        Input: given a sequence and list of active index
         Output: return active sequence and other sequence convert to _
         MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_
-        '''
-        if sequence_only: 
-            _seq = ''.join([seq[x] for x in active_poisition])
+        """
+        if sequence_only:
+            _seq = "".join([seq[x] for x in active_poisition])
             return _seq
-        _seq = ['_' for a in range(len(seq))]
+        _seq = ["_" for a in range(len(seq))]
         for x in active_poisition:
-            _seq[x] = seq[x]    
-        _seq = ''.join(_seq)
+            _seq[x] = seq[x]
+        _seq = "".join(_seq)
         return _seq
 
-    df = pd.read_pickle('data/iedb_jespersen.pkl')
+    df = pd.read_pickle("data/iedb_jespersen.pkl")
     fields_orig = df.columns.tolist()
-    assert fields_orig == ['ID', 'X', 'Y']
- 
-    
-    #Rename columns of raw data
-    fields_clean = ['Antigen_ID', 'Antigen_sequence', 'active_positions_indices']
+    assert fields_orig == ["ID", "X", "Y"]
+
+    # Rename columns of raw data
+    fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"]
     df.columns = fields_clean
-    
-    #get active position
+
+    # get active position
     antigen_seq = df.Antigen_sequence.tolist()
     a_pos_ind_list = df.active_positions_indices.tolist()
-    df['active_position'] = [get_active_position(x,o) for x,o in zip(antigen_seq, a_pos_ind_list)]
-    
+    df["active_position"] = [
+        get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list)
+    ]
+
     # save data to original
-    fn_data_original = 'data_original.csv'
-    df.to_csv(fn_data_original,index=None)
-    df = pd.read_csv(fn_data_original, sep=',')
+    fn_data_original = "data_original.csv"
+    df.to_csv(fn_data_original, index=None)
+    df = pd.read_csv(fn_data_original, sep=",")
     fields_orig = df.columns.tolist()
-    assert fields_orig == ['Antigen_ID',
-     'Antigen_sequence',
-     'active_positions_indices',
-     'active_position']
-    
+    assert fields_orig == [
+        "Antigen_ID",
+        "Antigen_sequence",
+        "active_positions_indices",
+        "active_position",
+    ]
+
     # get right columns
-    
-    df = df[['Antigen_sequence', 'active_position']]
-    fields_clean = ['Antigen_sequence', 'active_position']
+
+    df = df[["Antigen_sequence", "active_position"]]
+    fields_clean = ["Antigen_sequence", "active_position"]
     df.columns = fields_clean
     assert fields_orig != fields_clean
     assert not df.duplicated().sum()
     # save to csv
     fn_data_csv = "data_clean.csv"
     df.to_csv(fn_data_csv, index=False)
- 
 
     meta = {
         "name": f"{target_folder}",  # unique identifier, we will also use this for directory names
@@ -69,9 +73,9 @@ def get_active_position(seq, active_poisition, sequence_only=False):
                 "type": "Other",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts)
                     "amino acids sequence active in binding",
-                    "Epitope"
+                    "Epitope",
                 ],
-                "uris":[
+                "uris": [
                     "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189",
                 ],
             }
@@ -96,7 +100,7 @@ def get_active_position(seq, active_poisition, sequence_only=False):
             {
                 "url": "https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al",
                 "description": "data source",
-            }
+            },
         ],
         "num_points": len(df),  # number of datapoints in this dataset
         "bibtex": [
@@ -111,8 +115,7 @@ def get_active_position(seq, active_poisition, sequence_only=False):
               pages = {D339--D343},
               author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
               title = {The Immune Epitope Database ({IEDB}): 2018 update},
-              journal = {Nucleic Acids Research}}""", 
-
+              journal = {Nucleic Acids Research}}""",
             """@article{Jespersen2017,
               doi = {10.1093/nar/gkx346},
               url = {https://doi.org/10.1093/nar/gkx346},
@@ -124,11 +127,10 @@ def get_active_position(seq, active_poisition, sequence_only=False):
               pages = {W24--W29},
               author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
               title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
-              journal = {Nucleic Acids Research}}""", 
-
+              journal = {Nucleic Acids Research}}""",
         ],
     }
-    
+
     def str_presenter(dumper, data):
         """configures yaml for dumping multiline strings
         Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
@@ -147,5 +149,6 @@ def str_presenter(dumper, data):
 
     print(f"Finished processing {meta['name']} dataset!")
 
+
 if __name__ == "__main__":
     get_and_transform_data()
diff --git a/data/PDB_Jespersen_et_al/meta.yaml b/data/PDB_Jespersen_et_al/meta.yaml
index d771ee0e4..4a0a78b54 100644
--- a/data/PDB_Jespersen_et_al/meta.yaml
+++ b/data/PDB_Jespersen_et_al/meta.yaml
@@ -1,54 +1,54 @@
+---
 name: PDB_Jespersen_et_al
-description: Epitope prediction is to predict the active region in the antigen. This
-  dataset is from Bepipred, which curates a dataset from PDB. It collects B-cell epitopes
-  and non-epitope amino acids determined from crystal structures.
+description: Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from PDB. It collects
+    B-cell epitopes and non-epitope amino acids determined from crystal structures.
 targets:
-- id: active_position
-  description: ''
-  units: amino acids sequence position that is active in binding
-  type: Other
-  names:
-  - amino acids sequence active in binding
-  - Epitope
-  uris:
-  - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189
+    - id: active_position
+      description: ''
+      units: amino acids sequence position that is active in binding
+      type: Other
+      names:
+          - amino acids sequence active in binding
+          - Epitope
+      uris:
+          - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189
 identifiers:
-- id: Antigen_sequence
-  type: Other
-  description: amino acid sequence
+    - id: Antigen_sequence
+      type: Other
+      description: amino acid sequence
 license: CC BY 4.0
 links:
-- url: https://doi.org/10.1093/nar/gkx346
-  description: corresponding publication
-- url: https://doi.org/10.1093/nar/28.1.235
-  description: corresponding publication
-- url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al
-  description: data source
+    - url: https://doi.org/10.1093/nar/gkx346
+      description: corresponding publication
+    - url: https://doi.org/10.1093/nar/28.1.235
+      description: corresponding publication
+    - url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al
+      description: data source
 num_points: 447
 bibtex:
-- |-
-  @article{Jespersen2017,
-                doi = {10.1093/nar/gkx346},
-                url = {https://doi.org/10.1093/nar/gkx346},
-                year = {2017},
-                month = may,
-                publisher = {Oxford University Press ({OUP})},
-                volume = {45},
-                number = {W1},
-                pages = {W24--W29},
-                author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
-                title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
-                journal = {Nucleic Acids Research}}
-- |-
-  @article{Berman2000,
-                doi = {10.1093/nar/28.1.235},
-                url = {https://doi.org/10.1093/nar/28.1.235},
-                year = {2000},
-                month = jan,
-                publisher = {Oxford University Press ({OUP})},
-                volume = {28},
-                number = {1},
-                pages = {235--242},
-                author = {H. M. Berman},
-                title = {The Protein Data Bank},
-                journal = {Nucleic Acids Research}}
+    - |-
+      @article{Jespersen2017,
+                    doi = {10.1093/nar/gkx346},
+                    url = {https://doi.org/10.1093/nar/gkx346},
+                    year = {2017},
+                    month = may,
+                    publisher = {Oxford University Press ({OUP})},
+                    volume = {45},
+                    number = {W1},
+                    pages = {W24--W29},
+                    author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
+                    title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
+                    journal = {Nucleic Acids Research}}
+    - |-
+      @article{Berman2000,
+                    doi = {10.1093/nar/28.1.235},
+                    url = {https://doi.org/10.1093/nar/28.1.235},
+                    year = {2000},
+                    month = jan,
+                    publisher = {Oxford University Press ({OUP})},
+                    volume = {28},
+                    number = {1},
+                    pages = {235--242},
+                    author = {H. M. Berman},
+                    title = {The Protein Data Bank},
+                    journal = {Nucleic Acids Research}}
diff --git a/data/PDB_Jespersen_et_al/transform.py b/data/PDB_Jespersen_et_al/transform.py
index afb592343..128ec11b5 100644
--- a/data/PDB_Jespersen_et_al/transform.py
+++ b/data/PDB_Jespersen_et_al/transform.py
@@ -2,61 +2,66 @@
 import yaml
 from tdc.single_pred import Epitope
 
+
 def get_and_transform_data():
     # get raw data
-    target_folder = 'PDB_Jespersen_et_al'
-    target_subfolder = 'PDB_Jespersen'
-    data = Epitope(name = target_subfolder)
-    
+    target_folder = "PDB_Jespersen_et_al"
+    target_subfolder = "PDB_Jespersen"
+    data = Epitope(name=target_subfolder)
+
     def get_active_position(seq, active_poisition, sequence_only=False):
-        '''
-        Input: given a sequence and list of active index 
+        """
+        Input: given a sequence and list of active index
         Output: return active sequence and other sequence convert to _
         MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_
-        '''
-        if sequence_only: 
-            _seq = ''.join([seq[x] for x in active_poisition])
+        """
+        if sequence_only:
+            _seq = "".join([seq[x] for x in active_poisition])
             return _seq
-        _seq = ['_' for a in range(len(seq))]
+        _seq = ["_" for a in range(len(seq))]
         for x in active_poisition:
-            _seq[x] = seq[x]    
-        _seq = ''.join(_seq)
+            _seq[x] = seq[x]
+        _seq = "".join(_seq)
         return _seq
 
-    df = pd.read_pickle('data/pdb_jespersen.pkl')
+    df = pd.read_pickle("data/pdb_jespersen.pkl")
     fields_orig = df.columns.tolist()
-    assert fields_orig == ['ID', 'X', 'Y']
-    
-    #Rename columns of raw data
-    fields_clean = ['Antigen_ID', 'Antigen_sequence', 'active_positions_indices']
+    assert fields_orig == ["ID", "X", "Y"]
+
+    # Rename columns of raw data
+    fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"]
     df.columns = fields_clean
-    
-    #get active position
+
+    # get active position
     antigen_seq = df.Antigen_sequence.tolist()
     a_pos_ind_list = df.active_positions_indices.tolist()
-    df['active_position'] = [get_active_position(x,o) for x,o in zip(antigen_seq, a_pos_ind_list)]
-    
+    df["active_position"] = [
+        get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list)
+    ]
+
     # save data to original
-    fn_data_original = 'data_original.csv'
-    df.to_csv(fn_data_original,index=None)
-    df = pd.read_csv(fn_data_original, sep=',')
+    fn_data_original = "data_original.csv"
+    df.to_csv(fn_data_original, index=None)
+    df = pd.read_csv(fn_data_original, sep=",")
     fields_orig = df.columns.tolist()
-    assert fields_orig == ['Antigen_ID',
-     'Antigen_sequence',
-     'active_positions_indices',
-     'active_position']
-    
+    assert fields_orig == [
+        "Antigen_ID",
+        "Antigen_sequence",
+        "active_positions_indices",
+        "active_position",
+    ]
+
     # get right columns
-    
-    df = df[['Antigen_sequence', 'active_position']]
-    fields_clean = ['Antigen_sequence', 'active_position']
+
+    df = df[["Antigen_sequence", "active_position"]]
+    fields_clean = ["Antigen_sequence", "active_position"]
     df.columns = fields_clean
     assert fields_orig != fields_clean
     assert not df.duplicated().sum()
     # save to csv
     fn_data_csv = "data_clean.csv"
     df.to_csv(fn_data_csv, index=False)
-    
+
     meta = {
         "name": f"{target_folder}",  # unique identifier, we will also use this for directory names
         "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from PDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""",
@@ -68,14 +73,13 @@ def get_active_position(seq, active_poisition, sequence_only=False):
                 "type": "Other",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts)
                     "amino acids sequence active in binding",
-                    "Epitope"
+                    "Epitope",
                 ],
-                "uris":[
+                "uris": [
                     "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189",
                 ],
             }
         ],
-
         "identifiers": [
             {
                 "id": "Antigen_sequence",  # column name
@@ -96,11 +100,11 @@ def get_active_position(seq, active_poisition, sequence_only=False):
             {
                 "url": "https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al",
                 "description": "data source",
-            }
+            },
         ],
         "num_points": len(df),  # number of datapoints in this dataset
         "bibtex": [
-        """@article{Jespersen2017,
+            """@article{Jespersen2017,
               doi = {10.1093/nar/gkx346},
               url = {https://doi.org/10.1093/nar/gkx346},
               year = {2017},
@@ -111,9 +115,8 @@ def get_active_position(seq, active_poisition, sequence_only=False):
               pages = {W24--W29},
               author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
               title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
-              journal = {Nucleic Acids Research}}""", 
-
-        """@article{Berman2000,
+              journal = {Nucleic Acids Research}}""",
+            """@article{Berman2000,
               doi = {10.1093/nar/28.1.235},
               url = {https://doi.org/10.1093/nar/28.1.235},
               year = {2000},
@@ -124,11 +127,10 @@ def get_active_position(seq, active_poisition, sequence_only=False):
               pages = {235--242},
               author = {H. M. Berman},
               title = {The Protein Data Bank},
-              journal = {Nucleic Acids Research}}""", 
-
+              journal = {Nucleic Acids Research}}""",
         ],
     }
-    
+
     def str_presenter(dumper, data):
         """configures yaml for dumping multiline strings
         Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
@@ -147,5 +149,6 @@ def str_presenter(dumper, data):
 
     print(f"Finished processing {meta['name']} dataset!")
 
+
 if __name__ == "__main__":
     get_and_transform_data()

From 334a832d8f59cc8cb12ae93ed050948a02cf6ed4 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Sat, 11 Mar 2023 16:33:12 +0200
Subject: [PATCH 03/13] Update transform.py

---
 data/PDB_Jespersen_et_al/transform.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/data/PDB_Jespersen_et_al/transform.py b/data/PDB_Jespersen_et_al/transform.py
index 128ec11b5..a31702593 100644
--- a/data/PDB_Jespersen_et_al/transform.py
+++ b/data/PDB_Jespersen_et_al/transform.py
@@ -64,7 +64,9 @@ def get_active_position(seq, active_poisition, sequence_only=False):
 
     meta = {
         "name": f"{target_folder}",  # unique identifier, we will also use this for directory names
-        "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from PDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""",
+        "description": """Epitope prediction is to predict the active region in the antigen. 
+        This dataset is from Bepipred, which curates a dataset from PDB.
+        It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""",
         "targets": [
             {
                 "id": "active_position",  # name of the column in a tabular dataset

From 3bb5d0e4e795c66d5e5d05fead5b824831548ae6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 11 Mar 2023 14:33:19 +0000
Subject: [PATCH 04/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 data/PDB_Jespersen_et_al/transform.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/PDB_Jespersen_et_al/transform.py b/data/PDB_Jespersen_et_al/transform.py
index a31702593..dda7f298b 100644
--- a/data/PDB_Jespersen_et_al/transform.py
+++ b/data/PDB_Jespersen_et_al/transform.py
@@ -64,7 +64,7 @@ def get_active_position(seq, active_poisition, sequence_only=False):
 
     meta = {
         "name": f"{target_folder}",  # unique identifier, we will also use this for directory names
-        "description": """Epitope prediction is to predict the active region in the antigen. 
+        "description": """Epitope prediction is to predict the active region in the antigen.
         This dataset is from Bepipred, which curates a dataset from PDB.
         It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""",
         "targets": [

From ac9dd66d1e67df0adce220225211426d3d9e74f5 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Sat, 11 Mar 2023 16:37:23 +0200
Subject: [PATCH 05/13] Update transform.py

---
 data/PDB_Jespersen_et_al/transform.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/data/PDB_Jespersen_et_al/transform.py b/data/PDB_Jespersen_et_al/transform.py
index dda7f298b..ec27020e0 100644
--- a/data/PDB_Jespersen_et_al/transform.py
+++ b/data/PDB_Jespersen_et_al/transform.py
@@ -70,15 +70,15 @@ def get_active_position(seq, active_poisition, sequence_only=False):
         "targets": [
             {
                 "id": "active_position",  # name of the column in a tabular dataset
-                "description": "",  # description of what this column means
-                "units": "amino acids sequence position that is active in binding",  # units of the values in this column (leave empty if unitless)
+                "description": "amino acids sequence position that is active in binding",  # description of what this column means
+                "units": "",  # units of the values in this column (leave empty if unitless)
                 "type": "Other",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts)
                     "amino acids sequence active in binding",
                     "Epitope",
                 ],
                 "uris": [
-                    "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189",
+                    "https://rb.gy/l1st1c",
                 ],
             }
         ],

From 85e95a2eef60764fdc0dc24f33e655be2ef329b2 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Mon, 27 Mar 2023 19:37:23 +0200
Subject: [PATCH 06/13] Delete data/IEDB_Jespersen_et_al directory

---
 data/IEDB_Jespersen_et_al/meta.yaml    |  54 ---------
 data/IEDB_Jespersen_et_al/transform.py | 154 -------------------------
 2 files changed, 208 deletions(-)
 delete mode 100644 data/IEDB_Jespersen_et_al/meta.yaml
 delete mode 100644 data/IEDB_Jespersen_et_al/transform.py

diff --git a/data/IEDB_Jespersen_et_al/meta.yaml b/data/IEDB_Jespersen_et_al/meta.yaml
deleted file mode 100644
index 70430fb05..000000000
--- a/data/IEDB_Jespersen_et_al/meta.yaml
+++ /dev/null
@@ -1,54 +0,0 @@
----
-name: IEDB_Jespersen_et_al
-description: Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from IEDB. It collects
-    B-cell epitopes and non-epitope amino acids determined from crystal structures.
-targets:
-    - id: active_position
-      description: amino acids sequence position that is active in binding
-      units: ''
-      type: Other
-      names:
-          - amino acids sequence active in binding
-          - Epitope
-      uris:
-          - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189
-identifiers:
-    - id: Antigen_sequence
-      type: Other
-      description: amino acid sequence
-license: CC BY 4.0
-links:
-    - url: https://doi.org/10.1093/nar/gky1006
-      description: corresponding publication
-    - url: https://doi.org/10.1093/nar/gkx346
-      description: corresponding publication
-    - url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al
-      description: data source
-num_points: 3159
-bibtex:
-    - |-
-      @article{Vita2018,
-                    doi = {10.1093/nar/gky1006},
-                    url = {https://doi.org/10.1093/nar/gky1006},
-                    year = {2018},
-                    month = oct,
-                    publisher = {Oxford University Press ({OUP})},
-                    volume = {47},
-                    number = {D1},
-                    pages = {D339--D343},
-                    author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
-                    title = {The Immune Epitope Database ({IEDB}): 2018 update},
-                    journal = {Nucleic Acids Research}}
-    - |-
-      @article{Jespersen2017,
-                    doi = {10.1093/nar/gkx346},
-                    url = {https://doi.org/10.1093/nar/gkx346},
-                    year = {2017},
-                    month = may,
-                    publisher = {Oxford University Press ({OUP})},
-                    volume = {45},
-                    number = {W1},
-                    pages = {W24--W29},
-                    author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
-                    title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
-                    journal = {Nucleic Acids Research}}
diff --git a/data/IEDB_Jespersen_et_al/transform.py b/data/IEDB_Jespersen_et_al/transform.py
deleted file mode 100644
index 9dfa986b3..000000000
--- a/data/IEDB_Jespersen_et_al/transform.py
+++ /dev/null
@@ -1,154 +0,0 @@
-import pandas as pd
-import yaml
-from tdc.single_pred import Epitope
-
-
-def get_and_transform_data():
-    # get raw data
-    target_folder = "IEDB_Jespersen_et_al"
-    target_subfolder = "IEDB_Jespersen"
-    data = Epitope(name=target_subfolder)
-
-    def get_active_position(seq, active_poisition, sequence_only=False):
-        """
-        Input: given a sequence and list of active index
-        Output: return active sequence and other sequence convert to _
-        MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_
-        """
-        if sequence_only:
-            _seq = "".join([seq[x] for x in active_poisition])
-            return _seq
-        _seq = ["_" for a in range(len(seq))]
-        for x in active_poisition:
-            _seq[x] = seq[x]
-        _seq = "".join(_seq)
-        return _seq
-
-    df = pd.read_pickle("data/iedb_jespersen.pkl")
-    fields_orig = df.columns.tolist()
-    assert fields_orig == ["ID", "X", "Y"]
-
-    # Rename columns of raw data
-    fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"]
-    df.columns = fields_clean
-
-    # get active position
-    antigen_seq = df.Antigen_sequence.tolist()
-    a_pos_ind_list = df.active_positions_indices.tolist()
-    df["active_position"] = [
-        get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list)
-    ]
-
-    # save data to original
-    fn_data_original = "data_original.csv"
-    df.to_csv(fn_data_original, index=None)
-    df = pd.read_csv(fn_data_original, sep=",")
-    fields_orig = df.columns.tolist()
-    assert fields_orig == [
-        "Antigen_ID",
-        "Antigen_sequence",
-        "active_positions_indices",
-        "active_position",
-    ]
-
-    # get right columns
-
-    df = df[["Antigen_sequence", "active_position"]]
-    fields_clean = ["Antigen_sequence", "active_position"]
-    df.columns = fields_clean
-    assert fields_orig != fields_clean
-    assert not df.duplicated().sum()
-    # save to csv
-    fn_data_csv = "data_clean.csv"
-    df.to_csv(fn_data_csv, index=False)
-
-    meta = {
-        "name": f"{target_folder}",  # unique identifier, we will also use this for directory names
-        "description": """Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""",
-        "targets": [
-            {
-                "id": "active_position",  # name of the column in a tabular dataset
-                "description": "amino acids sequence position that is active in binding",  # description of what this column means
-                "units": "",  # units of the values in this column (leave empty if unitless)
-                "type": "Other",  # can be "categorical", "ordinal", "continuous"
-                "names": [  # names for the property (to sample from for building the prompts)
-                    "amino acids sequence active in binding",
-                    "Epitope",
-                ],
-                "uris": [
-                    "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189",
-                ],
-            }
-        ],
-        "identifiers": [
-            {
-                "id": "Antigen_sequence",  # column name
-                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
-                "description": "amino acid sequence",  # description (optional, except for "Other")
-            }
-        ],
-        "license": "CC BY 4.0",  # license under which the original dataset was published
-        "links": [  # list of relevant links (original dataset, other uses, etc.)
-            {
-                "url": "https://doi.org/10.1093/nar/gky1006",
-                "description": "corresponding publication",
-            },
-            {
-                "url": "https://doi.org/10.1093/nar/gkx346",
-                "description": "corresponding publication",
-            },
-            {
-                "url": "https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al",
-                "description": "data source",
-            },
-        ],
-        "num_points": len(df),  # number of datapoints in this dataset
-        "bibtex": [
-            """@article{Vita2018,
-              doi = {10.1093/nar/gky1006},
-              url = {https://doi.org/10.1093/nar/gky1006},
-              year = {2018},
-              month = oct,
-              publisher = {Oxford University Press ({OUP})},
-              volume = {47},
-              number = {D1},
-              pages = {D339--D343},
-              author = {Randi Vita and Swapnil Mahajan and James A Overton and Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
-              title = {The Immune Epitope Database ({IEDB}): 2018 update},
-              journal = {Nucleic Acids Research}}""",
-            """@article{Jespersen2017,
-              doi = {10.1093/nar/gkx346},
-              url = {https://doi.org/10.1093/nar/gkx346},
-              year = {2017},
-              month = may,
-              publisher = {Oxford University Press ({OUP})},
-              volume = {45},
-              number = {W1},
-              pages = {W24--W29},
-              author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
-              title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
-              journal = {Nucleic Acids Research}}""",
-        ],
-    }
-
-    def str_presenter(dumper, data):
-        """configures yaml for dumping multiline strings
-        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
-        """
-        if data.count("\n") > 0:  # check for multiline string
-            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
-        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
-
-    yaml.add_representer(str, str_presenter)
-    yaml.representer.SafeRepresenter.add_representer(
-        str, str_presenter
-    )  # to use with safe_dum
-    fn_meta = "meta.yaml"
-    with open(fn_meta, "w") as f:
-        yaml.dump(meta, f, sort_keys=False)
-
-    print(f"Finished processing {meta['name']} dataset!")
-
-
-if __name__ == "__main__":
-    get_and_transform_data()

From bd64e79aa04d4c976dcf2bd3fc12bad6b90b9f27 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Mon, 27 Mar 2023 19:37:31 +0200
Subject: [PATCH 07/13] Delete data/PDB_Jespersen_et_al directory

---
 data/PDB_Jespersen_et_al/meta.yaml    |  54 ---------
 data/PDB_Jespersen_et_al/transform.py | 156 --------------------------
 2 files changed, 210 deletions(-)
 delete mode 100644 data/PDB_Jespersen_et_al/meta.yaml
 delete mode 100644 data/PDB_Jespersen_et_al/transform.py

diff --git a/data/PDB_Jespersen_et_al/meta.yaml b/data/PDB_Jespersen_et_al/meta.yaml
deleted file mode 100644
index 4a0a78b54..000000000
--- a/data/PDB_Jespersen_et_al/meta.yaml
+++ /dev/null
@@ -1,54 +0,0 @@
----
-name: PDB_Jespersen_et_al
-description: Epitope prediction is to predict the active region in the antigen. This dataset is from Bepipred, which curates a dataset from PDB. It collects
-    B-cell epitopes and non-epitope amino acids determined from crystal structures.
-targets:
-    - id: active_position
-      description: ''
-      units: amino acids sequence position that is active in binding
-      type: Other
-      names:
-          - amino acids sequence active in binding
-          - Epitope
-      uris:
-          - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189
-identifiers:
-    - id: Antigen_sequence
-      type: Other
-      description: amino acid sequence
-license: CC BY 4.0
-links:
-    - url: https://doi.org/10.1093/nar/gkx346
-      description: corresponding publication
-    - url: https://doi.org/10.1093/nar/28.1.235
-      description: corresponding publication
-    - url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al
-      description: data source
-num_points: 447
-bibtex:
-    - |-
-      @article{Jespersen2017,
-                    doi = {10.1093/nar/gkx346},
-                    url = {https://doi.org/10.1093/nar/gkx346},
-                    year = {2017},
-                    month = may,
-                    publisher = {Oxford University Press ({OUP})},
-                    volume = {45},
-                    number = {W1},
-                    pages = {W24--W29},
-                    author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
-                    title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
-                    journal = {Nucleic Acids Research}}
-    - |-
-      @article{Berman2000,
-                    doi = {10.1093/nar/28.1.235},
-                    url = {https://doi.org/10.1093/nar/28.1.235},
-                    year = {2000},
-                    month = jan,
-                    publisher = {Oxford University Press ({OUP})},
-                    volume = {28},
-                    number = {1},
-                    pages = {235--242},
-                    author = {H. M. Berman},
-                    title = {The Protein Data Bank},
-                    journal = {Nucleic Acids Research}}
diff --git a/data/PDB_Jespersen_et_al/transform.py b/data/PDB_Jespersen_et_al/transform.py
deleted file mode 100644
index ec27020e0..000000000
--- a/data/PDB_Jespersen_et_al/transform.py
+++ /dev/null
@@ -1,156 +0,0 @@
-import pandas as pd
-import yaml
-from tdc.single_pred import Epitope
-
-
-def get_and_transform_data():
-    # get raw data
-    target_folder = "PDB_Jespersen_et_al"
-    target_subfolder = "PDB_Jespersen"
-    data = Epitope(name=target_subfolder)
-
-    def get_active_position(seq, active_poisition, sequence_only=False):
-        """
-        Input: given a sequence and list of active index
-        Output: return active sequence and other sequence convert to _
-        MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_
-        """
-        if sequence_only:
-            _seq = "".join([seq[x] for x in active_poisition])
-            return _seq
-        _seq = ["_" for a in range(len(seq))]
-        for x in active_poisition:
-            _seq[x] = seq[x]
-        _seq = "".join(_seq)
-        return _seq
-
-    df = pd.read_pickle("data/pdb_jespersen.pkl")
-    fields_orig = df.columns.tolist()
-    assert fields_orig == ["ID", "X", "Y"]
-
-    # Rename columns of raw data
-    fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"]
-    df.columns = fields_clean
-
-    # get active position
-    antigen_seq = df.Antigen_sequence.tolist()
-    a_pos_ind_list = df.active_positions_indices.tolist()
-    df["active_position"] = [
-        get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list)
-    ]
-
-    # save data to original
-    fn_data_original = "data_original.csv"
-    df.to_csv(fn_data_original, index=None)
-    df = pd.read_csv(fn_data_original, sep=",")
-    fields_orig = df.columns.tolist()
-    assert fields_orig == [
-        "Antigen_ID",
-        "Antigen_sequence",
-        "active_positions_indices",
-        "active_position",
-    ]
-
-    # get right columns
-
-    df = df[["Antigen_sequence", "active_position"]]
-    fields_clean = ["Antigen_sequence", "active_position"]
-    df.columns = fields_clean
-    assert fields_orig != fields_clean
-    assert not df.duplicated().sum()
-    # save to csv
-    fn_data_csv = "data_clean.csv"
-    df.to_csv(fn_data_csv, index=False)
-
-    meta = {
-        "name": f"{target_folder}",  # unique identifier, we will also use this for directory names
-        "description": """Epitope prediction is to predict the active region in the antigen.
-        This dataset is from Bepipred, which curates a dataset from PDB.
-        It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""",
-        "targets": [
-            {
-                "id": "active_position",  # name of the column in a tabular dataset
-                "description": "amino acids sequence position that is active in binding",  # description of what this column means
-                "units": "",  # units of the values in this column (leave empty if unitless)
-                "type": "Other",  # can be "categorical", "ordinal", "continuous"
-                "names": [  # names for the property (to sample from for building the prompts)
-                    "amino acids sequence active in binding",
-                    "Epitope",
-                ],
-                "uris": [
-                    "https://rb.gy/l1st1c",
-                ],
-            }
-        ],
-        "identifiers": [
-            {
-                "id": "Antigen_sequence",  # column name
-                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
-                "description": "amino acid sequence",  # description (optional, except for "Other")
-            }
-        ],
-        "license": "CC BY 4.0",  # license under which the original dataset was published
-        "links": [  # list of relevant links (original dataset, other uses, etc.)
-            {
-                "url": "https://doi.org/10.1093/nar/gkx346",
-                "description": "corresponding publication",
-            },
-            {
-                "url": "https://doi.org/10.1093/nar/28.1.235",
-                "description": "corresponding publication",
-            },
-            {
-                "url": "https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al",
-                "description": "data source",
-            },
-        ],
-        "num_points": len(df),  # number of datapoints in this dataset
-        "bibtex": [
-            """@article{Jespersen2017,
-              doi = {10.1093/nar/gkx346},
-              url = {https://doi.org/10.1093/nar/gkx346},
-              year = {2017},
-              month = may,
-              publisher = {Oxford University Press ({OUP})},
-              volume = {45},
-              number = {W1},
-              pages = {W24--W29},
-              author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
-              title = {{BepiPred}-2.0: improving sequence-based B-cell epitope prediction using conformational epitopes},
-              journal = {Nucleic Acids Research}}""",
-            """@article{Berman2000,
-              doi = {10.1093/nar/28.1.235},
-              url = {https://doi.org/10.1093/nar/28.1.235},
-              year = {2000},
-              month = jan,
-              publisher = {Oxford University Press ({OUP})},
-              volume = {28},
-              number = {1},
-              pages = {235--242},
-              author = {H. M. Berman},
-              title = {The Protein Data Bank},
-              journal = {Nucleic Acids Research}}""",
-        ],
-    }
-
-    def str_presenter(dumper, data):
-        """configures yaml for dumping multiline strings
-        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
-        """
-        if data.count("\n") > 0:  # check for multiline string
-            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
-        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
-
-    yaml.add_representer(str, str_presenter)
-    yaml.representer.SafeRepresenter.add_representer(
-        str, str_presenter
-    )  # to use with safe_dum
-    fn_meta = "meta.yaml"
-    with open(fn_meta, "w") as f:
-        yaml.dump(meta, f, sort_keys=False)
-
-    print(f"Finished processing {meta['name']} dataset!")
-
-
-if __name__ == "__main__":
-    get_and_transform_data()

From 57e301c64dfa0a8ce0c1c3a426d46518bd5e0175 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Mon, 27 Mar 2023 19:37:49 +0200
Subject: [PATCH 08/13] Add files via upload

---
 data/iedb_jespersen_et_al/meta.yaml    |  67 ++++++++++
 data/iedb_jespersen_et_al/transform.py | 172 +++++++++++++++++++++++++
 data/pdb_jespersen_et_al/meta.yaml     |  65 ++++++++++
 data/pdb_jespersen_et_al/transform.py  | 170 ++++++++++++++++++++++++
 4 files changed, 474 insertions(+)
 create mode 100644 data/iedb_jespersen_et_al/meta.yaml
 create mode 100644 data/iedb_jespersen_et_al/transform.py
 create mode 100644 data/pdb_jespersen_et_al/meta.yaml
 create mode 100644 data/pdb_jespersen_et_al/transform.py

diff --git a/data/iedb_jespersen_et_al/meta.yaml b/data/iedb_jespersen_et_al/meta.yaml
new file mode 100644
index 000000000..282cc264b
--- /dev/null
+++ b/data/iedb_jespersen_et_al/meta.yaml
@@ -0,0 +1,67 @@
+name: iedb_jespersen_et_al
+description: |-
+  Epitope prediction is to predict the active region in the antigen.
+  This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell
+  epitopes and non-epitope amino acids determined from crystal structures.
+targets:
+- id: active_position
+  description: amino acids sequence position that is active in binding
+  units: ''
+  type: categorical
+  names:
+  - amino acids sequence active in binding
+  - Epitope
+  uris:
+  - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189
+benchmarks:
+- name: TDC
+  link: https://tdcommons.ai/
+  split_column: split
+identifiers:
+- id: Antigen_sequence
+  type: Other
+  names:
+  - amino acid sequence
+  - FASTQ
+  - fastq sequence
+  - Protien sequence
+  description: amino acid sequence
+license: CC BY 4.0
+links:
+- url: https://doi.org/10.1093/nar/gky1006
+  description: corresponding publication
+- url: https://doi.org/10.1093/nar/gkx346
+  description: corresponding publication
+- url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al
+  description: data source
+num_points: 3159
+bibtex:
+- |-
+  @article{Vita2018,
+  doi = {10.1093/nar/gky1006},
+  url = {https://doi.org/10.1093/nar/gky1006},
+  year = {2018},
+  month = oct,
+  publisher = {Oxford University Press (OUP)},
+  volume = {47},
+  number = {D1},
+  pages = {D339--D343}},
+  author = {Randi Vita and Swapnil Mahajan and James A Overton and
+  Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and
+  Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
+  title = {The Immune Epitope Database (IEDB): 2018 update},
+  journal = {Nucleic Acids Research}
+- |-
+  @article{Jespersen2017,
+  doi = {10.1093/nar/gkx346},
+  url = {https://doi.org/10.1093/nar/gkx346},
+  year = {2017},
+  month = may,
+  publisher = {Oxford University Press (OUP)},
+  volume = {45},
+  number = {W1},
+  pages = {W24--W29},
+  author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
+  title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using
+  conformational epitopes},
+  journal = {Nucleic Acids Research}
diff --git a/data/iedb_jespersen_et_al/transform.py b/data/iedb_jespersen_et_al/transform.py
new file mode 100644
index 000000000..e9a0cd0df
--- /dev/null
+++ b/data/iedb_jespersen_et_al/transform.py
@@ -0,0 +1,172 @@
+import pandas as pd
+import yaml
+from tdc.single_pred import Epitope
+
+
+def get_and_transform_data():
+    # get raw data
+    target_folder = "IEDB_Jespersen_et_al"
+    target_subfolder = "IEDB_Jespersen"
+    data = Epitope(name=target_subfolder)
+
+    def get_active_position(seq, active_poisition, sequence_only=False):
+        """
+        Input: given a sequence and list of active index
+        Output: return active sequence and other sequence convert to _
+        MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_
+        """
+        if sequence_only:
+            _seq = "".join([seq[x] for x in active_poisition])
+            return _seq
+        _seq = ["_" for a in range(len(seq))]
+        for x in active_poisition:
+            _seq[x] = seq[x]
+        _seq = "".join(_seq)
+        return _seq
+
+    df = pd.read_pickle("data/iedb_jespersen.pkl")
+    fields_orig = df.columns.tolist()
+    assert fields_orig == ["ID", "X", "Y"]
+
+    # Rename columns of raw data
+    fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"]
+    df.columns = fields_clean
+
+    # get active position
+    antigen_seq = df.Antigen_sequence.tolist()
+    a_pos_ind_list = df.active_positions_indices.tolist()
+    df["active_position"] = [
+        get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list)
+    ]
+
+    # save data to original
+    fn_data_original = "data_original.csv"
+    df.to_csv(fn_data_original, index=None)
+    df = pd.read_csv(fn_data_original, sep=",")
+    fields_orig = df.columns.tolist()
+    assert fields_orig == [
+        "Antigen_ID",
+        "Antigen_sequence",
+        "active_positions_indices",
+        "active_position",
+    ]
+
+    # get right columns
+
+    df = df[["Antigen_sequence", "active_position"]]
+    fields_clean = ["Antigen_sequence", "active_position"]
+    df.columns = fields_clean
+    assert fields_orig != fields_clean
+    assert not df.duplicated().sum()
+    # save to csv
+    fn_data_csv = "data_clean.csv"
+    df.to_csv(fn_data_csv, index=False)
+
+    meta = {
+        "name": "iedb_jespersen_et_al",  # unique identifier, we will also use this for directory names
+        "description": """Epitope prediction is to predict the active region in the antigen.
+This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell
+epitopes and non-epitope amino acids determined from crystal structures.""",
+        "targets": [
+            {
+                "id": "active_position",  # name of the column in a tabular dataset
+                "description": "amino acids sequence position that is active in binding",  # description of what this column means
+                "units": "",  # units of the values in this column (leave empty if unitless)
+                "type": "categorical",  # can be "categorical", "ordinal", "continuous"
+                "names": [  # names for the property (to sample from for building the prompts)
+                    "amino acids sequence active in binding",
+                    "Epitope",
+                ],
+                "uris": [
+                    "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189",
+                ],
+            }
+        ],
+        "benchmarks": [
+            {
+            "name": "TDC",  # unique benchmark name
+            "link": "https://tdcommons.ai/",  # benchmark URL
+            "split_column": "split",  # name of the column that contains the split information
+            },
+        ],
+        "identifiers": [
+            {
+                "id": "Antigen_sequence",  # column name
+                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
+                "names": [
+                "amino acid sequence",
+                "FASTQ",
+                "fastq sequence",
+                "Protien sequence"
+                ],
+                "description": "amino acid sequence",  # description (optional, except for "Other")
+            }
+        ],
+        "license": "CC BY 4.0",  # license under which the original dataset was published
+        "links": [  # list of relevant links (original dataset, other uses, etc.)
+            {
+                "url": "https://doi.org/10.1093/nar/gky1006",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://doi.org/10.1093/nar/gkx346",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al",
+                "description": "data source",
+            },
+        ],
+        "num_points": len(df),  # number of datapoints in this dataset
+        "bibtex": [
+            """@article{Vita2018,
+doi = {10.1093/nar/gky1006},
+url = {https://doi.org/10.1093/nar/gky1006},
+year = {2018},
+month = oct,
+publisher = {Oxford University Press (OUP)},
+volume = {47},
+number = {D1},
+pages = {D339--D343}},
+author = {Randi Vita and Swapnil Mahajan and James A Overton and
+Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and
+Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
+title = {The Immune Epitope Database (IEDB): 2018 update},
+journal = {Nucleic Acids Research}""",
+            """@article{Jespersen2017,
+doi = {10.1093/nar/gkx346},
+url = {https://doi.org/10.1093/nar/gkx346},
+year = {2017},
+month = may,
+publisher = {Oxford University Press (OUP)},
+volume = {45},
+number = {W1},
+pages = {W24--W29},
+author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
+title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using
+conformational epitopes},
+journal = {Nucleic Acids Research}""",
+        ],
+    }
+
+    def str_presenter(dumper, data):
+        """configures yaml for dumping multiline strings
+        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+        """
+        if data.count("\n") > 0:  # check for multiline string
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+    yaml.add_representer(str, str_presenter)
+    yaml.representer.SafeRepresenter.add_representer(
+        str, str_presenter
+    )  # to use with safe_dum
+    fn_meta = "meta.yaml"
+    with open(fn_meta, "w") as f:
+        yaml.dump(meta, f, sort_keys=False)
+
+    print(f"Finished processing {meta['name']} dataset!")
+
+
+if __name__ == "__main__":
+    get_and_transform_data()
diff --git a/data/pdb_jespersen_et_al/meta.yaml b/data/pdb_jespersen_et_al/meta.yaml
new file mode 100644
index 000000000..4480baf47
--- /dev/null
+++ b/data/pdb_jespersen_et_al/meta.yaml
@@ -0,0 +1,65 @@
+name: pdb_jespersen_et_al
+description: |-
+  Epitope prediction is to predict the active region in the antigen.
+  This dataset is from Bepipred, which curates a dataset from PDB.
+  It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.
+targets:
+- id: active_position
+  description: amino acids sequence position that is active in binding
+  units: ''
+  type: categorical
+  names:
+  - amino acids sequence active in binding
+  - Epitope
+  uris:
+  - https://rb.gy/l1st1c
+benchmarks:
+- name: TDC
+  link: https://tdcommons.ai/
+  split_column: split
+identifiers:
+- id: Antigen_sequence
+  type: Other
+  names:
+  - amino acid sequence
+  - FASTQ
+  - fastq sequence
+  - Protien sequence
+  description: amino acid sequence
+license: CC BY 4.0
+links:
+- url: https://doi.org/10.1093/nar/gkx346
+  description: corresponding publication
+- url: https://doi.org/10.1093/nar/28.1.235
+  description: corresponding publication
+- url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al
+  description: data source
+num_points: 447
+bibtex:
+- |-
+  @article{Jespersen2017,
+  doi = {10.1093/nar/gkx346},
+  url = {https://doi.org/10.1093/nar/gkx346},
+  year = {2017},
+  month = may,
+  publisher = {Oxford University Press (OUP)},
+  volume = {45},
+  number = {W1},
+  pages = {W24--W29},
+  author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
+  title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using
+  conformational epitopes},
+  journal = {Nucleic Acids Research}
+- |-
+  @article{Berman2000,
+  doi = {10.1093/nar/28.1.235},
+  url = {https://doi.org/10.1093/nar/28.1.235},
+  year = {2000},
+  month = jan,
+  publisher = {Oxford University Press (OUP)},
+  volume = {28},
+  number = {1},
+  pages = {235--242},
+  author = {H. M. Berman},
+  title = {The Protein Data Bank},
+  journal = {Nucleic Acids Research}
diff --git a/data/pdb_jespersen_et_al/transform.py b/data/pdb_jespersen_et_al/transform.py
new file mode 100644
index 000000000..3e21c2cad
--- /dev/null
+++ b/data/pdb_jespersen_et_al/transform.py
@@ -0,0 +1,170 @@
+import pandas as pd
+import yaml
+from tdc.single_pred import Epitope
+
+
+def get_and_transform_data():
+    # get raw data
+    target_folder = "PDB_Jespersen_et_al"
+    target_subfolder = "PDB_Jespersen"
+    data = Epitope(name=target_subfolder)
+
+    def get_active_position(seq, active_poisition, sequence_only=False):
+        """
+        Input: given a sequence and list of active index
+        Output: return active sequence and other sequence convert to _
+        MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_
+        """
+        if sequence_only:
+            _seq = "".join([seq[x] for x in active_poisition])
+            return _seq
+        _seq = ["_" for a in range(len(seq))]
+        for x in active_poisition:
+            _seq[x] = seq[x]
+        _seq = "".join(_seq)
+        return _seq
+
+    df = pd.read_pickle("data/pdb_jespersen.pkl")
+    fields_orig = df.columns.tolist()
+    assert fields_orig == ["ID", "X", "Y"]
+
+    # Rename columns of raw data
+    fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"]
+    df.columns = fields_clean
+
+    # get active position
+    antigen_seq = df.Antigen_sequence.tolist()
+    a_pos_ind_list = df.active_positions_indices.tolist()
+    df["active_position"] = [
+        get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list)
+    ]
+
+    # save data to original
+    fn_data_original = "data_original.csv"
+    df.to_csv(fn_data_original, index=None)
+    df = pd.read_csv(fn_data_original, sep=",")
+    fields_orig = df.columns.tolist()
+    assert fields_orig == [
+        "Antigen_ID",
+        "Antigen_sequence",
+        "active_positions_indices",
+        "active_position",
+    ]
+
+    # get right columns
+
+    df = df[["Antigen_sequence", "active_position"]]
+    fields_clean = ["Antigen_sequence", "active_position"]
+    df.columns = fields_clean
+    assert fields_orig != fields_clean
+    assert not df.duplicated().sum()
+    # save to csv
+    fn_data_csv = "data_clean.csv"
+    df.to_csv(fn_data_csv, index=False)
+
+    meta = {
+        "name": "pdb_jespersen_et_al",  # unique identifier, we will also use this for directory names
+        "description": """Epitope prediction is to predict the active region in the antigen.
+This dataset is from Bepipred, which curates a dataset from PDB.
+It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.""",
+        "targets": [
+            {
+                "id": "active_position",  # name of the column in a tabular dataset
+                "description": "amino acids sequence position that is active in binding",  # description of what this column means
+                "units": "",  # units of the values in this column (leave empty if unitless)
+                "type": "categorical",  # can be "categorical", "ordinal", "continuous"
+                "names": [  # names for the property (to sample from for building the prompts)
+                    "amino acids sequence active in binding",
+                    "Epitope",
+                ],
+                "uris": [
+                    "https://rb.gy/l1st1c",
+                ],
+            }
+        ],
+        "benchmarks": [
+        {
+            "name": "TDC",  # unique benchmark name
+            "link": "https://tdcommons.ai/",  # benchmark URL
+            "split_column": "split",  # name of the column that contains the split information
+        },
+        ],
+        "identifiers": [
+            {
+                "id": "Antigen_sequence",  # column name
+                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
+                "names": [
+                "amino acid sequence",
+                "FASTQ",
+                "fastq sequence",
+                "Protien sequence"
+                ],
+                "description": "amino acid sequence",  # description (optional, except for "Other")
+            }
+        ],
+        "license": "CC BY 4.0",  # license under which the original dataset was published
+        "links": [  # list of relevant links (original dataset, other uses, etc.)
+            {
+                "url": "https://doi.org/10.1093/nar/gkx346",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://doi.org/10.1093/nar/28.1.235",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al",
+                "description": "data source",
+            },
+        ],
+        "num_points": len(df),  # number of datapoints in this dataset
+        "bibtex": [
+            """@article{Jespersen2017,
+doi = {10.1093/nar/gkx346},
+url = {https://doi.org/10.1093/nar/gkx346},
+year = {2017},
+month = may,
+publisher = {Oxford University Press (OUP)},
+volume = {45},
+number = {W1},
+pages = {W24--W29},
+author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
+title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using
+conformational epitopes},
+journal = {Nucleic Acids Research}""",
+            """@article{Berman2000,
+doi = {10.1093/nar/28.1.235},
+url = {https://doi.org/10.1093/nar/28.1.235},
+year = {2000},
+month = jan,
+publisher = {Oxford University Press (OUP)},
+volume = {28},
+number = {1},
+pages = {235--242},
+author = {H. M. Berman},
+title = {The Protein Data Bank},
+journal = {Nucleic Acids Research}""",
+        ],
+    }
+
+    def str_presenter(dumper, data):
+        """configures yaml for dumping multiline strings
+        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+        """
+        if data.count("\n") > 0:  # check for multiline string
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+    yaml.add_representer(str, str_presenter)
+    yaml.representer.SafeRepresenter.add_representer(
+        str, str_presenter
+    )  # to use with safe_dum
+    fn_meta = "meta.yaml"
+    with open(fn_meta, "w") as f:
+        yaml.dump(meta, f, sort_keys=False)
+
+    print(f"Finished processing {meta['name']} dataset!")
+
+
+if __name__ == "__main__":
+    get_and_transform_data()

From 56113d9794a40b3fd172ba9c0071501cca8dfc07 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Wed, 29 Mar 2023 03:23:10 +0200
Subject: [PATCH 09/13] Remove benchmark field

The data is complex and is obtained in an indirect way, so the benchmark split was not implemented.
---
 data/iedb_jespersen_et_al/meta.yaml    |  4 ----
 data/iedb_jespersen_et_al/transform.py | 13 +++----------
 data/pdb_jespersen_et_al/meta.yaml     |  4 ----
 data/pdb_jespersen_et_al/transform.py  |  7 -------
 4 files changed, 3 insertions(+), 25 deletions(-)

diff --git a/data/iedb_jespersen_et_al/meta.yaml b/data/iedb_jespersen_et_al/meta.yaml
index 282cc264b..6d577b3ad 100644
--- a/data/iedb_jespersen_et_al/meta.yaml
+++ b/data/iedb_jespersen_et_al/meta.yaml
@@ -13,10 +13,6 @@ targets:
   - Epitope
   uris:
   - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189
-benchmarks:
-- name: TDC
-  link: https://tdcommons.ai/
-  split_column: split
 identifiers:
 - id: Antigen_sequence
   type: Other
diff --git a/data/iedb_jespersen_et_al/transform.py b/data/iedb_jespersen_et_al/transform.py
index e9a0cd0df..ff48d9568 100644
--- a/data/iedb_jespersen_et_al/transform.py
+++ b/data/iedb_jespersen_et_al/transform.py
@@ -63,14 +63,14 @@ def get_active_position(seq, active_poisition, sequence_only=False):
     df.to_csv(fn_data_csv, index=False)
 
     meta = {
-        "name": "iedb_jespersen_et_al",  # unique identifier, we will also use this for directory names
+        "name": "iedb_jespersen_et_al", 
         "description": """Epitope prediction is to predict the active region in the antigen.
 This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell
 epitopes and non-epitope amino acids determined from crystal structures.""",
         "targets": [
             {
                 "id": "active_position",  # name of the column in a tabular dataset
-                "description": "amino acids sequence position that is active in binding",  # description of what this column means
+                "description": "amino acids sequence position that is active in binding",
                 "units": "",  # units of the values in this column (leave empty if unitless)
                 "type": "categorical",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts)
@@ -82,13 +82,6 @@ def get_active_position(seq, active_poisition, sequence_only=False):
                 ],
             }
         ],
-        "benchmarks": [
-            {
-            "name": "TDC",  # unique benchmark name
-            "link": "https://tdcommons.ai/",  # benchmark URL
-            "split_column": "split",  # name of the column that contains the split information
-            },
-        ],
         "identifiers": [
             {
                 "id": "Antigen_sequence",  # column name
@@ -99,7 +92,7 @@ def get_active_position(seq, active_poisition, sequence_only=False):
                 "fastq sequence",
                 "Protien sequence"
                 ],
-                "description": "amino acid sequence",  # description (optional, except for "Other")
+                "description": "amino acid sequence",  # d
             }
         ],
         "license": "CC BY 4.0",  # license under which the original dataset was published
diff --git a/data/pdb_jespersen_et_al/meta.yaml b/data/pdb_jespersen_et_al/meta.yaml
index 4480baf47..48701c82d 100644
--- a/data/pdb_jespersen_et_al/meta.yaml
+++ b/data/pdb_jespersen_et_al/meta.yaml
@@ -13,10 +13,6 @@ targets:
   - Epitope
   uris:
   - https://rb.gy/l1st1c
-benchmarks:
-- name: TDC
-  link: https://tdcommons.ai/
-  split_column: split
 identifiers:
 - id: Antigen_sequence
   type: Other
diff --git a/data/pdb_jespersen_et_al/transform.py b/data/pdb_jespersen_et_al/transform.py
index 3e21c2cad..7a4a77fb3 100644
--- a/data/pdb_jespersen_et_al/transform.py
+++ b/data/pdb_jespersen_et_al/transform.py
@@ -82,13 +82,6 @@ def get_active_position(seq, active_poisition, sequence_only=False):
                 ],
             }
         ],
-        "benchmarks": [
-        {
-            "name": "TDC",  # unique benchmark name
-            "link": "https://tdcommons.ai/",  # benchmark URL
-            "split_column": "split",  # name of the column that contains the split information
-        },
-        ],
         "identifiers": [
             {
                 "id": "Antigen_sequence",  # column name

From dd81fc743e36f4606d9e2ba92e66a7971595c769 Mon Sep 17 00:00:00 2001
From: Michael Pieler <Michael.Pieler@Gmail.com>
Date: Wed, 19 Apr 2023 14:31:42 +0200
Subject: [PATCH 10/13] feat: iedb_jespersen_et_al clean up

---
 data/iedb_jespersen_et_al/meta.yaml    | 113 +++++++++++++------------
 data/iedb_jespersen_et_al/transform.py |  62 ++++++++++----
 2 files changed, 102 insertions(+), 73 deletions(-)

diff --git a/data/iedb_jespersen_et_al/meta.yaml b/data/iedb_jespersen_et_al/meta.yaml
index 6d577b3ad..ca352405c 100644
--- a/data/iedb_jespersen_et_al/meta.yaml
+++ b/data/iedb_jespersen_et_al/meta.yaml
@@ -1,63 +1,66 @@
+---
 name: iedb_jespersen_et_al
 description: |-
-  Epitope prediction is to predict the active region in the antigen.
-  This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell
-  epitopes and non-epitope amino acids determined from crystal structures.
+    Epitope prediction is to predict the active region in the antigen.
+    This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell
+    epitopes and non-epitope amino acids determined from crystal structures.
 targets:
-- id: active_position
-  description: amino acids sequence position that is active in binding
-  units: ''
-  type: categorical
-  names:
-  - amino acids sequence active in binding
-  - Epitope
-  uris:
-  - https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189
+    - id: active_position
+      description: amino acids sequence position that is active in binding
+      units:
+      type: categorical
+      names:
+          - epitope
+          - amino acids sequence active in antigen binding
+          - epitope sequence active in antigen binding
+          - epitope sequence active in binding
+      uris:
+          - http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189
 identifiers:
-- id: Antigen_sequence
-  type: Other
-  names:
-  - amino acid sequence
-  - FASTQ
-  - fastq sequence
-  - Protien sequence
-  description: amino acid sequence
+    - id: Antigen_sequence
+      type: Other
+      names:
+          - amino acid sequence
+          - AA sequence
+          - epitope amino acid sequence
+          - epitope AA sequence
+      description: amino acid sequence
 license: CC BY 4.0
 links:
-- url: https://doi.org/10.1093/nar/gky1006
-  description: corresponding publication
-- url: https://doi.org/10.1093/nar/gkx346
-  description: corresponding publication
-- url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al
-  description: data source
+    - url: https://doi.org/10.1093/nar/gky1006
+      description: corresponding publication
+    - url: https://doi.org/10.1093/nar/gkx346
+      description: corresponding publication
+    - url: https://tdcommons.ai/single_pred_tasks/epitope/#iedb-jespersen-et-al
+      description: data source
 num_points: 3159
 bibtex:
-- |-
-  @article{Vita2018,
-  doi = {10.1093/nar/gky1006},
-  url = {https://doi.org/10.1093/nar/gky1006},
-  year = {2018},
-  month = oct,
-  publisher = {Oxford University Press (OUP)},
-  volume = {47},
-  number = {D1},
-  pages = {D339--D343}},
-  author = {Randi Vita and Swapnil Mahajan and James A Overton and
-  Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and
-  Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
-  title = {The Immune Epitope Database (IEDB): 2018 update},
-  journal = {Nucleic Acids Research}
-- |-
-  @article{Jespersen2017,
-  doi = {10.1093/nar/gkx346},
-  url = {https://doi.org/10.1093/nar/gkx346},
-  year = {2017},
-  month = may,
-  publisher = {Oxford University Press (OUP)},
-  volume = {45},
-  number = {W1},
-  pages = {W24--W29},
-  author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
-  title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using
-  conformational epitopes},
-  journal = {Nucleic Acids Research}
+    - |-
+      @article{Vita2018,
+      doi = {10.1093/nar/gky1006},
+      url = {https://doi.org/10.1093/nar/gky1006},
+      year = {2018},
+      month = oct,
+      publisher = {Oxford University Press (OUP)},
+      volume = {47},
+      number = {D1},
+      pages = {D339--D343},
+      author = {Randi Vita and Swapnil Mahajan and James A Overton and
+      Sandeep Kumar Dhanda and Sheridan Martini and Jason R Cantrell and
+      Daniel K Wheeler and Alessandro Sette and Bjoern Peters},
+      title = {The Immune Epitope Database (IEDB): 2018 update},
+      journal = {Nucleic Acids Research}
+    - |-
+      @article{Jespersen2017,
+      doi = {10.1093/nar/gkx346},
+      url = {https://doi.org/10.1093/nar/gkx346},
+      year = {2017},
+      month = may,
+      publisher = {Oxford University Press (OUP)},
+      volume = {45},
+      number = {W1},
+      pages = {W24--W29},
+      author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
+      title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using
+      conformational epitopes},
+      journal = {Nucleic Acids Research}
diff --git a/data/iedb_jespersen_et_al/transform.py b/data/iedb_jespersen_et_al/transform.py
index ff48d9568..fe545a66e 100644
--- a/data/iedb_jespersen_et_al/transform.py
+++ b/data/iedb_jespersen_et_al/transform.py
@@ -5,38 +5,61 @@
 
 def get_and_transform_data():
     # get raw data
-    target_folder = "IEDB_Jespersen_et_al"
     target_subfolder = "IEDB_Jespersen"
-    data = Epitope(name=target_subfolder)
+    splits = Epitope(name=target_subfolder).get_split()
+    df_train = splits["train"]
+    df_valid = splits["valid"]
+    df_test = splits["test"]
+    df_train["split"] = "train"
+    df_valid["split"] = "valid"
+    df_test["split"] = "test"
+    df = pd.concat([df_train, df_valid, df_test], axis=0)
 
-    def get_active_position(seq, active_poisition, sequence_only=False):
+    fn_data_raw = "data_raw.csv"
+    df.to_csv(fn_data_raw, index=False)
+    del df
+
+    def get_active_position(seq, active_position, sequence_only=False):
         """
         Input: given a sequence and list of active index
         Output: return active sequence and other sequence convert to _
         MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_
         """
+        if isinstance(
+            active_position, str
+        ):  # the list was cast to a string when loaded back from the raw csv data file.
+            active_position = [int(x) for x in active_position[1:-1].split(", ")]
+
         if sequence_only:
-            _seq = "".join([seq[x] for x in active_poisition])
+            _seq = "".join([seq[x] for x in active_position])
             return _seq
+
         _seq = ["_" for a in range(len(seq))]
-        for x in active_poisition:
+        for x in active_position:
             _seq[x] = seq[x]
         _seq = "".join(_seq)
         return _seq
 
-    df = pd.read_pickle("data/iedb_jespersen.pkl")
+    # process raw data
+    df = pd.read_csv(fn_data_raw, sep=",")
     fields_orig = df.columns.tolist()
-    assert fields_orig == ["ID", "X", "Y"]
+    assert fields_orig == ["Antigen_ID", "Antigen", "Y", "split"]
 
     # Rename columns of raw data
-    fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"]
+    fields_clean = [
+        "Antigen_ID",
+        "Antigen_sequence",
+        "active_positions_indices",
+        "split",
+    ]
     df.columns = fields_clean
 
     # get active position
     antigen_seq = df.Antigen_sequence.tolist()
     a_pos_ind_list = df.active_positions_indices.tolist()
     df["active_position"] = [
-        get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list)
+        get_active_position(x, o, sequence_only=True)
+        for x, o in zip(antigen_seq, a_pos_ind_list)
     ]
 
     # save data to original
@@ -48,6 +71,7 @@ def get_active_position(seq, active_poisition, sequence_only=False):
         "Antigen_ID",
         "Antigen_sequence",
         "active_positions_indices",
+        "split",
         "active_position",
     ]
 
@@ -63,7 +87,7 @@ def get_active_position(seq, active_poisition, sequence_only=False):
     df.to_csv(fn_data_csv, index=False)
 
     meta = {
-        "name": "iedb_jespersen_et_al", 
+        "name": "iedb_jespersen_et_al",
         "description": """Epitope prediction is to predict the active region in the antigen.
 This dataset is from Bepipred, which curates a dataset from IEDB. It collects B-cell
 epitopes and non-epitope amino acids determined from crystal structures.""",
@@ -71,14 +95,16 @@ def get_active_position(seq, active_poisition, sequence_only=False):
             {
                 "id": "active_position",  # name of the column in a tabular dataset
                 "description": "amino acids sequence position that is active in binding",
-                "units": "",  # units of the values in this column (leave empty if unitless)
+                "units": None,  # units of the values in this column (leave empty if unitless)
                 "type": "categorical",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts)
-                    "amino acids sequence active in binding",
-                    "Epitope",
+                    "epitope",
+                    "amino acids sequence active in antigen binding",
+                    "epitope sequence active in antigen binding",
+                    "epitope sequence active in binding",
                 ],
                 "uris": [
-                    "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C13189",
+                    "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189",
                 ],
             }
         ],
@@ -87,10 +113,10 @@ def get_active_position(seq, active_poisition, sequence_only=False):
                 "id": "Antigen_sequence",  # column name
                 "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                 "names": [
-                "amino acid sequence",
-                "FASTQ",
-                "fastq sequence",
-                "Protien sequence"
+                    "amino acid sequence",
+                    "AA sequence",
+                    "epitope amino acid sequence",
+                    "epitope AA sequence",
                 ],
                 "description": "amino acid sequence",  # d
             }

From 21f6ecf281c048cd25a7ec9626293c6d9e740114 Mon Sep 17 00:00:00 2001
From: Michael Pieler <Michael.Pieler@Gmail.com>
Date: Wed, 19 Apr 2023 14:43:25 +0200
Subject: [PATCH 11/13] feat: pdb_jespersen_et_al clean up

---
 data/pdb_jespersen_et_al/meta.yaml    | 109 +++++++++++++-------------
 data/pdb_jespersen_et_al/transform.py |  62 ++++++++++-----
 2 files changed, 100 insertions(+), 71 deletions(-)

diff --git a/data/pdb_jespersen_et_al/meta.yaml b/data/pdb_jespersen_et_al/meta.yaml
index 48701c82d..5fdcac996 100644
--- a/data/pdb_jespersen_et_al/meta.yaml
+++ b/data/pdb_jespersen_et_al/meta.yaml
@@ -1,61 +1,64 @@
+---
 name: pdb_jespersen_et_al
 description: |-
-  Epitope prediction is to predict the active region in the antigen.
-  This dataset is from Bepipred, which curates a dataset from PDB.
-  It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.
+    Epitope prediction is to predict the active region in the antigen.
+    This dataset is from Bepipred, which curates a dataset from PDB.
+    It collects B-cell epitopes and non-epitope amino acids determined from crystal structures.
 targets:
-- id: active_position
-  description: amino acids sequence position that is active in binding
-  units: ''
-  type: categorical
-  names:
-  - amino acids sequence active in binding
-  - Epitope
-  uris:
-  - https://rb.gy/l1st1c
+    - id: active_position
+      description: amino acids sequence position that is active in binding
+      units:
+      type: categorical
+      names:
+          - epitope
+          - amino acids sequence active in antigen binding
+          - epitope sequence active in antigen binding
+          - epitope sequence active in binding
+      uris:
+          - http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189
 identifiers:
-- id: Antigen_sequence
-  type: Other
-  names:
-  - amino acid sequence
-  - FASTQ
-  - fastq sequence
-  - Protien sequence
-  description: amino acid sequence
+    - id: Antigen_sequence
+      type: Other
+      names:
+          - amino acid sequence
+          - AA sequence
+          - epitope amino acid sequence
+          - epitope AA sequence
+      description: amino acid sequence
 license: CC BY 4.0
 links:
-- url: https://doi.org/10.1093/nar/gkx346
-  description: corresponding publication
-- url: https://doi.org/10.1093/nar/28.1.235
-  description: corresponding publication
-- url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al
-  description: data source
+    - url: https://doi.org/10.1093/nar/gkx346
+      description: corresponding publication
+    - url: https://doi.org/10.1093/nar/28.1.235
+      description: corresponding publication
+    - url: https://tdcommons.ai/single_pred_tasks/epitope/#pdb-jespersen-et-al
+      description: data source
 num_points: 447
 bibtex:
-- |-
-  @article{Jespersen2017,
-  doi = {10.1093/nar/gkx346},
-  url = {https://doi.org/10.1093/nar/gkx346},
-  year = {2017},
-  month = may,
-  publisher = {Oxford University Press (OUP)},
-  volume = {45},
-  number = {W1},
-  pages = {W24--W29},
-  author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
-  title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using
-  conformational epitopes},
-  journal = {Nucleic Acids Research}
-- |-
-  @article{Berman2000,
-  doi = {10.1093/nar/28.1.235},
-  url = {https://doi.org/10.1093/nar/28.1.235},
-  year = {2000},
-  month = jan,
-  publisher = {Oxford University Press (OUP)},
-  volume = {28},
-  number = {1},
-  pages = {235--242},
-  author = {H. M. Berman},
-  title = {The Protein Data Bank},
-  journal = {Nucleic Acids Research}
+    - |-
+      @article{Jespersen2017,
+      doi = {10.1093/nar/gkx346},
+      url = {https://doi.org/10.1093/nar/gkx346},
+      year = {2017},
+      month = may,
+      publisher = {Oxford University Press (OUP)},
+      volume = {45},
+      number = {W1},
+      pages = {W24--W29},
+      author = {Martin Closter Jespersen and Bjoern Peters and Morten Nielsen and Paolo Marcatili},
+      title = {BepiPred 2.0: improving sequence-based B-cell epitope prediction using
+      conformational epitopes},
+      journal = {Nucleic Acids Research}
+    - |-
+      @article{Berman2000,
+      doi = {10.1093/nar/28.1.235},
+      url = {https://doi.org/10.1093/nar/28.1.235},
+      year = {2000},
+      month = jan,
+      publisher = {Oxford University Press (OUP)},
+      volume = {28},
+      number = {1},
+      pages = {235--242},
+      author = {H. M. Berman},
+      title = {The Protein Data Bank},
+      journal = {Nucleic Acids Research}
diff --git a/data/pdb_jespersen_et_al/transform.py b/data/pdb_jespersen_et_al/transform.py
index 7a4a77fb3..fd9a38747 100644
--- a/data/pdb_jespersen_et_al/transform.py
+++ b/data/pdb_jespersen_et_al/transform.py
@@ -5,38 +5,61 @@
 
 def get_and_transform_data():
     # get raw data
-    target_folder = "PDB_Jespersen_et_al"
     target_subfolder = "PDB_Jespersen"
-    data = Epitope(name=target_subfolder)
+    splits = Epitope(name=target_subfolder).get_split()
+    df_train = splits["train"]
+    df_valid = splits["valid"]
+    df_test = splits["test"]
+    df_train["split"] = "train"
+    df_valid["split"] = "valid"
+    df_test["split"] = "test"
+    df = pd.concat([df_train, df_valid, df_test], axis=0)
 
-    def get_active_position(seq, active_poisition, sequence_only=False):
+    fn_data_raw = "data_raw.csv"
+    df.to_csv(fn_data_raw, index=False)
+    del df
+
+    def get_active_position(seq, active_position, sequence_only=False):
         """
         Input: given a sequence and list of active index
         Output: return active sequence and other sequence convert to _
         MASQKRPS ,[1,2,3,4,6] -> _ASQK_P_
         """
+        if isinstance(
+            active_position, str
+        ):  # the list was cast to a string when loaded back from the raw csv data file.
+            active_position = [int(x) for x in active_position[1:-1].split(", ")]
+
         if sequence_only:
-            _seq = "".join([seq[x] for x in active_poisition])
+            _seq = "".join([seq[x] for x in active_position])
             return _seq
+
         _seq = ["_" for a in range(len(seq))]
-        for x in active_poisition:
+        for x in active_position:
             _seq[x] = seq[x]
         _seq = "".join(_seq)
         return _seq
 
-    df = pd.read_pickle("data/pdb_jespersen.pkl")
+    # process raw data
+    df = pd.read_csv(fn_data_raw, sep=",")
     fields_orig = df.columns.tolist()
-    assert fields_orig == ["ID", "X", "Y"]
+    assert fields_orig == ["Antigen_ID", "Antigen", "Y", "split"]
 
     # Rename columns of raw data
-    fields_clean = ["Antigen_ID", "Antigen_sequence", "active_positions_indices"]
+    fields_clean = [
+        "Antigen_ID",
+        "Antigen_sequence",
+        "active_positions_indices",
+        "split",
+    ]
     df.columns = fields_clean
 
     # get active position
     antigen_seq = df.Antigen_sequence.tolist()
     a_pos_ind_list = df.active_positions_indices.tolist()
     df["active_position"] = [
-        get_active_position(x, o) for x, o in zip(antigen_seq, a_pos_ind_list)
+        get_active_position(x, o, sequence_only=True)
+        for x, o in zip(antigen_seq, a_pos_ind_list)
     ]
 
     # save data to original
@@ -48,6 +71,7 @@ def get_active_position(seq, active_poisition, sequence_only=False):
         "Antigen_ID",
         "Antigen_sequence",
         "active_positions_indices",
+        "split",
         "active_position",
     ]
 
@@ -70,15 +94,17 @@ def get_active_position(seq, active_poisition, sequence_only=False):
         "targets": [
             {
                 "id": "active_position",  # name of the column in a tabular dataset
-                "description": "amino acids sequence position that is active in binding",  # description of what this column means
-                "units": "",  # units of the values in this column (leave empty if unitless)
+                "description": "amino acids sequence position that is active in binding",
+                "units": None,  # units of the values in this column (leave empty if unitless)
                 "type": "categorical",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts)
-                    "amino acids sequence active in binding",
-                    "Epitope",
+                    "epitope",
+                    "amino acids sequence active in antigen binding",
+                    "epitope sequence active in antigen binding",
+                    "epitope sequence active in binding",
                 ],
                 "uris": [
-                    "https://rb.gy/l1st1c",
+                    "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189",
                 ],
             }
         ],
@@ -87,10 +113,10 @@ def get_active_position(seq, active_poisition, sequence_only=False):
                 "id": "Antigen_sequence",  # column name
                 "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                 "names": [
-                "amino acid sequence",
-                "FASTQ",
-                "fastq sequence",
-                "Protien sequence"
+                    "amino acid sequence",
+                    "AA sequence",
+                    "epitope amino acid sequence",
+                    "epitope AA sequence",
                 ],
                 "description": "amino acid sequence",  # description (optional, except for "Other")
             }

From e47fc44e137065b792abe4528e7671ead9f107e3 Mon Sep 17 00:00:00 2001
From: Michael Pieler <Michael.Pieler@Gmail.com>
Date: Thu, 27 Apr 2023 14:12:38 +0200
Subject: [PATCH 12/13] feat: update new names setup for iedb_jespersen_et_al

---
 data/iedb_jespersen_et_al/meta.yaml    | 8 ++++----
 data/iedb_jespersen_et_al/transform.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/data/iedb_jespersen_et_al/meta.yaml b/data/iedb_jespersen_et_al/meta.yaml
index ca352405c..72638e1fa 100644
--- a/data/iedb_jespersen_et_al/meta.yaml
+++ b/data/iedb_jespersen_et_al/meta.yaml
@@ -10,10 +10,10 @@ targets:
       units:
       type: categorical
       names:
-          - epitope
-          - amino acids sequence active in antigen binding
-          - epitope sequence active in antigen binding
-          - epitope sequence active in binding
+          - noun: epitope
+          - noun: amino acids sequence active in the antigen binding
+          - noun: epitope sequence active in the antigen binding
+          - noun: epitope sequence active in the binding
       uris:
           - http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189
 identifiers:
diff --git a/data/iedb_jespersen_et_al/transform.py b/data/iedb_jespersen_et_al/transform.py
index fe545a66e..22472ba74 100644
--- a/data/iedb_jespersen_et_al/transform.py
+++ b/data/iedb_jespersen_et_al/transform.py
@@ -98,10 +98,10 @@ def get_active_position(seq, active_position, sequence_only=False):
                 "units": None,  # units of the values in this column (leave empty if unitless)
                 "type": "categorical",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts)
-                    "epitope",
-                    "amino acids sequence active in antigen binding",
-                    "epitope sequence active in antigen binding",
-                    "epitope sequence active in binding",
+                    {"noun": "epitope"},
+                    {"noun": "amino acids sequence active in the antigen binding"},
+                    {"noun": "epitope sequence active in the antigen binding"},
+                    {"noun": "epitope sequence active in the binding"},
                 ],
                 "uris": [
                     "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189",

From 714ee93a1acc419e0913f498f6dbb1d31b7a11a2 Mon Sep 17 00:00:00 2001
From: Michael Pieler <Michael.Pieler@Gmail.com>
Date: Thu, 27 Apr 2023 14:13:35 +0200
Subject: [PATCH 13/13] feat: update new names setup for pdb_jespersen_et_al

---
 data/pdb_jespersen_et_al/meta.yaml    | 8 ++++----
 data/pdb_jespersen_et_al/transform.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/data/pdb_jespersen_et_al/meta.yaml b/data/pdb_jespersen_et_al/meta.yaml
index 5fdcac996..61df1008b 100644
--- a/data/pdb_jespersen_et_al/meta.yaml
+++ b/data/pdb_jespersen_et_al/meta.yaml
@@ -10,10 +10,10 @@ targets:
       units:
       type: categorical
       names:
-          - epitope
-          - amino acids sequence active in antigen binding
-          - epitope sequence active in antigen binding
-          - epitope sequence active in binding
+          - noun: epitope
+          - noun: amino acids sequence active in antigen binding
+          - noun: epitope sequence active in antigen binding
+          - noun: epitope sequence active in binding
       uris:
           - http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189
 identifiers:
diff --git a/data/pdb_jespersen_et_al/transform.py b/data/pdb_jespersen_et_al/transform.py
index fd9a38747..c399f34cd 100644
--- a/data/pdb_jespersen_et_al/transform.py
+++ b/data/pdb_jespersen_et_al/transform.py
@@ -98,10 +98,10 @@ def get_active_position(seq, active_position, sequence_only=False):
                 "units": None,  # units of the values in this column (leave empty if unitless)
                 "type": "categorical",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts)
-                    "epitope",
-                    "amino acids sequence active in antigen binding",
-                    "epitope sequence active in antigen binding",
-                    "epitope sequence active in binding",
+                    {"noun": "epitope"},
+                    {"noun": "amino acids sequence active in antigen binding"},
+                    {"noun": "epitope sequence active in antigen binding"},
+                    {"noun": "epitope sequence active in binding"},
                 ],
                 "uris": [
                     "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13189",