diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml new file mode 100644 index 000000000..5166a66e7 --- /dev/null +++ b/data/sabdab_chen/meta.yaml @@ -0,0 +1,82 @@ +--- +name: sabdab_chen +description: |- + Antibody data from Chen et al, where they process from the SAbDab. + From an initial dataset of 3816 antibodies, they retained 2426 antibodies that + satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data + Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, + and 3. have crystal structures with resolution < 0.3 nm. The DI label is derived + from BIOVIA's pipelines. +targets: + - id: developability + description: functional antibody candidate to be developed into a manufacturable one (1) or not (0) + units: + type: boolean + names: + - noun: functional antibody candidate to be developed into a manufacturable one + - noun: manufacturable and functional antibody candidate + uris: +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split +identifiers: + - id: antibody_pdb_ID + type: Other + names: + - pdb id + - Protein Data Bank id + description: antibody pdb id + - id: heavy_chain + type: Other + names: + - amino acid sequence + - heavy chain amino acid sequence + - heavy chain AA sequence + description: antibody heavy chain amino acid sequence in FASTA + - id: light_chain + type: Other + names: + - amino acid sequence + - light chain amino acid sequence + - light chain AA sequence + description: antibody light chain amino acid sequence in FASTA +license: CC BY 4.0 +links: + - url: https://doi.org/10.1101/2020.06.18.159798 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gkt1043 + description: corresponding publication + - url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ + description: corresponding tools used + - url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al + description: data source 
def get_and_transform_data():
    """Fetch the TDC ``SAbDab_Chen`` dataset and write its cleaned files.

    The function works entirely through files in the current working
    directory and returns nothing. It writes, in order:

    * ``data_raw.csv``      - train/valid/test partitions stacked, with a
      ``split`` column naming the partition each row came from.
    * ``data_original.csv`` - same rows, with the ``Antibody`` column split
      into ``heavy_chain`` and ``light_chain``.
    * ``data_clean.csv``    - same rows with the final column names.
    * ``meta.yaml``         - dataset metadata (description, targets,
      identifiers, links, BibTeX, number of rows).

    Raises:
        AssertionError: if the downloaded columns differ from the expected
            ones, or if the cleaned table contains duplicated rows.
    """
    # get raw data
    target_subfolder = "SAbDab_Chen"
    splits = Develop(name=target_subfolder).get_split()
    # Tag each partition with its name on a copy (``assign``) instead of
    # mutating the frames handed back by TDC, then stack them.
    df = pd.concat(
        [splits[name].assign(split=name) for name in ("train", "valid", "test")],
        axis=0,
    )

    fn_data_raw = "data_raw.csv"
    df.to_csv(fn_data_raw, index=False)

    # proceed raw data (re-read from disk so later steps see what was saved)
    df = pd.read_csv(fn_data_raw, sep=",")
    assert df.columns.tolist() == ["Antibody_ID", "Antibody", "Y", "split"]

    def s2l(list_string):
        """Turn the text of a list such as "['A', 'B']" into ["A", "B"].

        Surrounding square brackets and all single quotes are removed, the
        remainder is split on commas and every item is whitespace-stripped.
        """
        return list(map(str.strip, list_string.strip("][").replace("'", "").split(",")))

    # Parse each antibody entry once; item 0 is stored as the heavy chain and
    # item 1 as the light chain.
    chains = [s2l(entry) for entry in df.Antibody.tolist()]
    df["heavy_chain"] = [pair[0] for pair in chains]
    df["light_chain"] = [pair[1] for pair in chains]
    df = df[["Antibody_ID", "heavy_chain", "light_chain", "Y", "split"]]

    fn_data_original = "data_original.csv"
    df.to_csv(fn_data_original, index=False)

    # load raw data and assert columns
    df = pd.read_csv(fn_data_original, sep=",")
    fields_orig = df.columns.tolist()
    assert fields_orig == ["Antibody_ID", "heavy_chain", "light_chain", "Y", "split"]
    fields_clean = [
        "antibody_pdb_ID",
        "heavy_chain",
        "light_chain",
        "developability",
        "split",
    ]
    df.columns = fields_clean
    assert not df.duplicated().sum()

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    meta = {
        "name": "sabdab_chen",  # unique identifier, we will also use this for directory names
        "description": """Antibody data from Chen et al, where they process from the SAbDab.
From an initial dataset of 3816 antibodies, they retained 2426 antibodies that
satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data
Bank (PDB) structure files, 2. contain both a heavy chain and a light chain,
and 3. have crystal structures with resolution < 0.3 nm. The DI label is derived
from BIOVIA's pipelines.""",
        "targets": [
            {
                "id": "developability",  # name of the column in a tabular dataset
                "description": "functional antibody candidate to be developed into a manufacturable one (1) or not (0)",
                "units": None,  # units of the values in this column (leave empty if unitless)
                "type": "boolean",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    {
                        "noun": "functional antibody candidate to be developed into a manufacturable one"
                    },
                    {"noun": "manufacturable and functional antibody candidate"},
                ],
                "uris": None,
            },
        ],
        "benchmarks": [
            {
                "name": "TDC",  # unique benchmark name
                "link": "https://tdcommons.ai/",  # benchmark URL
                "split_column": "split",  # name of the column that contains the split information
            },
        ],
        "identifiers": [
            {
                "id": "antibody_pdb_ID",  # column name
                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                "names": [
                    "pdb id",
                    "Protein Data Bank id",
                ],
                "description": "antibody pdb id",  # description (optional, except for "Other")
            },
            {
                "id": "heavy_chain",  # column name
                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                "names": [
                    "amino acid sequence",
                    "heavy chain amino acid sequence",
                    "heavy chain AA sequence",
                ],
                "description": "antibody heavy chain amino acid sequence in FASTA",
            },
            {
                "id": "light_chain",  # column name
                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                "names": [
                    "amino acid sequence",
                    "light chain amino acid sequence",
                    "light chain AA sequence",
                ],
                "description": "antibody light chain amino acid sequence in FASTA",
            },
        ],
        "license": "CC BY 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            {
                "url": "https://doi.org/10.1101/2020.06.18.159798",
                "description": "corresponding publication",
            },
            {
                "url": "https://doi.org/10.1093/nar/gkt1043",
                "description": "corresponding publication",
            },
            {
                "url": "https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/",
                "description": "corresponding tools used",
            },
            {
                "url": "https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al",
                "description": "data source",
            },
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        # Each entry ends with its own closing brace so it is a complete,
        # parseable BibTeX record.
        "bibtex": [
            """@article{Chen2020,
doi = {10.1101/2020.06.18.159798},
url = {https://doi.org/10.1101/2020.06.18.159798},
year = {2020},
month = jun,
publisher = {Cold Spring Harbor Laboratory},
author = {Xingyao Chen and Thomas Dougherty and
Chan Hong and Rachel Schibler and Yi Cong Zhao and
Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman},
title = {Predicting Antibody Developability from Sequence
using Machine Learning}
}""",
            """@article{Dunbar2013,
doi = {10.1093/nar/gkt1043},
url = {https://doi.org/10.1093/nar/gkt1043},
year = {2013},
month = nov,
publisher = {Oxford University Press ({OUP})},
volume = {42},
number = {D1},
pages = {D1140--D1146},
author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem
and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and
Charlotte M. Deane},
title = {SAbDab: the structural antibody database},
journal = {Nucleic Acids Research}
}""",
        ],
    }

    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    # NOTE: both registrations change PyYAML's representers for the whole
    # process, not only for the dump below.
    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dump
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")
+targets: + - id: CDR_Length + description: complementarity-determining regions (CDR) length + units: amino acids + type: continuous + names: + - noun: antibody complementarity-determining regions length + - noun: antibody complementarity-determining regions (CDR) length + - noun: antibody CDR length + - noun: complementarity-determining regions (CDR) length + - noun: complementarity-determining regions length + - noun: CDR length + uris: + - id: PSH + description: patches of surface hydrophobicity (PSH) score + units: + type: continuous + names: + - antibody patches of surface hydrophobicity (PSH) score + - antibody patches of surface hydrophobicity score + - antibody PSH score + - patches of surface hydrophobicity (PSH) score + - patches of surface hydrophobicity score + - PSH score + uris: + - id: PPC + description: patches of positive charge (PPC) score + units: + type: continuous + names: + - antibody patches of positive charge (PPC) score + - antibody patches of positive charge score + - antibody PPC score + - patches of positive charge (PPC) score + - patches of positive charge score + - PPC score + uris: + - id: PNC + description: patches of negative charge (PNC) score + units: + type: continuous + names: + - antibody patches of negative charge (PNC) score + - antibody patches of negative charge score + - antibody PNC score + - patches of negative charge (PNC) score + - patches of negative charge score + - PNC score + uris: + - id: SFvCSP + description: structural Fv charge symmetry parameter (SFvCSP) score + units: + type: continuous + names: + - antibody structural Fv charge symmetry parameter (SFvCSP) score + - antibody structural Fv charge symmetry parameter score + - antibody SFvCSP score + - structural Fv charge symmetry parameter (SFvCSP) score + - structural Fv charge symmetry parameter score + - SFvCSP score + uris: +identifiers: + - id: antibody_name + type: Other + names: + - antibody name + - name of the antibody + - name of the antibody drug + 
description: antibody name + - id: heavy_chain + type: Other + names: + - amino acid sequence + - heavy chain amino acid sequence + - heavy chain AA sequence + description: antibody heavy chain amino acid sequence + - id: light_chain + type: Other + names: + - amino acid sequence + - light chain amino acid sequence + - light chain AA sequence + description: antibody light chain amino acid sequence +license: CC BY 4.0 +links: + - url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication + - url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source +num_points: 241 +bibtex: + - |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. 
def get_and_transform_data():
    """Fetch the TDC ``TAP`` dataset and write its cleaned files.

    One TDC query is made per label returned by
    ``retrieve_label_name_list("TAP")``; the per-label tables are merged on
    antibody id, antibody sequence string and split. The function works
    entirely through files in the current working directory and returns
    nothing. It writes, in order:

    * ``data_raw.csv``      - merged table with one column per label.
    * ``data_original.csv`` - same rows plus ``heavy_chain`` and
      ``light_chain`` columns parsed from the antibody sequence string.
    * ``data_clean.csv``    - final column selection.
    * ``meta.yaml``         - dataset metadata (description, targets,
      identifiers, links, BibTeX, number of rows).

    Raises:
        AssertionError: if the downloaded or intermediate columns differ from
            the expected ones, or if the cleaned table contains duplicated
            rows.
    """
    # get raw data
    target_subfolder = "TAP"
    label_list = retrieve_label_name_list(target_subfolder)
    df = pd.DataFrame()
    for i, label in enumerate(label_list):
        print(f"Get data subset {label}:")
        splits = Develop(name=target_subfolder, label_name=label).get_split()
        # Tag each partition with its name on a copy (``assign``) instead of
        # mutating the frames handed back by TDC, then stack them.
        df_cat = pd.concat(
            [splits[name].assign(split=name) for name in ("train", "valid", "test")],
            axis=0,
        )
        assert df_cat.columns.tolist() == ["Antibody_ID", "Antibody", "Y", "split"]
        # the generic target column "Y" takes the name of the current label
        df_cat.columns = ["Antibody_ID", "Antibody", label, "split"]
        if i > 0:
            # default (inner) merge: only rows present for every label so far
            # with the same id, sequence string and split are kept
            df = pd.merge(df, df_cat, on=["Antibody_ID", "Antibody", "split"])
        else:
            df = df_cat

    fn_data_raw = "data_raw.csv"
    df.to_csv(fn_data_raw, index=False)

    # proceed raw data (re-read from disk so later steps see what was saved)
    df = pd.read_csv(fn_data_raw, sep=",")
    label_columns = ["CDR_Length", "split", "PSH", "PPC", "PNC", "SFvCSP"]
    assert df.columns.tolist() == ["Antibody_ID", "Antibody"] + label_columns
    df.columns = ["antibody_name", "antibody_sequences"] + label_columns

    def s2l(list_string):
        """Turn the text of a list such as "['A', 'B']" into ["A", "B"].

        Surrounding square brackets and all single quotes are removed, the
        remainder is split on commas and every item is whitespace-stripped.
        """
        return list(map(str.strip, list_string.strip("][").replace("'", "").split(",")))

    def antibody2list(list_string):
        """Split the first list item on the two-character text backslash + n."""
        return [x.strip() for x in s2l(list_string)[0].split("\\n")]

    # convert list column to two columns; parse each entry once, item 0 is
    # stored as the heavy chain and item 1 as the light chain
    chains = [antibody2list(entry) for entry in df.antibody_sequences.tolist()]
    df["heavy_chain"] = [pair[0] for pair in chains]
    df["light_chain"] = [pair[1] for pair in chains]
    fn_data_original = "data_original.csv"
    df.to_csv(fn_data_original, index=False)

    # load raw data and assert columns
    df = pd.read_csv(fn_data_original, sep=",")
    assert df.columns.tolist() == (
        ["antibody_name", "antibody_sequences"]
        + label_columns
        + ["heavy_chain", "light_chain"]
    )

    # drop the unparsed sequence string and put the chains next to the name
    df = df[["antibody_name", "heavy_chain", "light_chain"] + label_columns]
    assert not df.duplicated().sum()

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    meta = {
        "name": "tap",  # unique identifier, we will also use this for directory names
        "description": """Immunogenicity, instability, self-association,
high viscosity, polyspecificity, or poor expression can all preclude
an antibody from becoming a therapeutic. Early identification of these
negative characteristics is essential. Akin to the Lipinski guidelines,
which measure druglikeness in small molecules,
Therapeutic Antibody Profiler (TAP) highlights antibodies
that possess characteristics that are rare/unseen in
clinical-stage mAb therapeutics.""",
        # Every target lists its names as {"noun": ...} mappings so that all
        # five targets share one schema.
        "targets": [
            {
                "id": "CDR_Length",  # name of the column in a tabular dataset
                "description": "complementarity-determining regions (CDR) length",
                "units": "amino acids",  # units of the values in this column (leave empty if unitless)
                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    {"noun": "antibody complementarity-determining regions length"},
                    {
                        "noun": "antibody complementarity-determining regions (CDR) length"
                    },
                    {"noun": "antibody CDR length"},
                    {"noun": "complementarity-determining regions (CDR) length"},
                    {"noun": "complementarity-determining regions length"},
                    {"noun": "CDR length"},
                ],
                "uris": None,
            },
            {
                "id": "PSH",  # name of the column in a tabular dataset
                "description": "patches of surface hydrophobicity (PSH) score",
                "units": None,  # units of the values in this column (leave empty if unitless)
                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    {"noun": "antibody patches of surface hydrophobicity (PSH) score"},
                    {"noun": "antibody patches of surface hydrophobicity score"},
                    {"noun": "antibody PSH score"},
                    {"noun": "patches of surface hydrophobicity (PSH) score"},
                    {"noun": "patches of surface hydrophobicity score"},
                    {"noun": "PSH score"},
                ],
                "uris": None,
            },
            {
                "id": "PPC",  # name of the column in a tabular dataset
                "description": "patches of positive charge (PPC) score",
                "units": None,  # units of the values in this column (leave empty if unitless)
                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    {"noun": "antibody patches of positive charge (PPC) score"},
                    {"noun": "antibody patches of positive charge score"},
                    {"noun": "antibody PPC score"},
                    {"noun": "patches of positive charge (PPC) score"},
                    {"noun": "patches of positive charge score"},
                    {"noun": "PPC score"},
                ],
                "uris": None,
            },
            {
                "id": "PNC",  # name of the column in a tabular dataset
                "description": "patches of negative charge (PNC) score",
                "units": None,  # units of the values in this column (leave empty if unitless)
                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    {"noun": "antibody patches of negative charge (PNC) score"},
                    {"noun": "antibody patches of negative charge score"},
                    {"noun": "antibody PNC score"},
                    {"noun": "patches of negative charge (PNC) score"},
                    {"noun": "patches of negative charge score"},
                    {"noun": "PNC score"},
                ],
                "uris": None,
            },
            {
                "id": "SFvCSP",  # name of the column in a tabular dataset
                "description": "structural Fv charge symmetry parameter (SFvCSP) score",
                "units": None,  # units of the values in this column (leave empty if unitless)
                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    {
                        "noun": "antibody structural Fv charge symmetry parameter (SFvCSP) score"
                    },
                    {"noun": "antibody structural Fv charge symmetry parameter score"},
                    {"noun": "antibody SFvCSP score"},
                    {"noun": "structural Fv charge symmetry parameter (SFvCSP) score"},
                    {"noun": "structural Fv charge symmetry parameter score"},
                    {"noun": "SFvCSP score"},
                ],
                "uris": None,
            },
        ],
        "identifiers": [
            {
                "id": "antibody_name",  # column name
                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                "names": [
                    "antibody name",
                    "name of the antibody",
                    "name of the antibody drug",
                ],
                "description": "antibody name",  # description (optional, except for "Other")
            },
            {
                "id": "heavy_chain",  # column name
                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                "names": [
                    "amino acid sequence",
                    "heavy chain amino acid sequence",
                    "heavy chain AA sequence",
                ],
                "description": "antibody heavy chain amino acid sequence",  # description (optional, except for "Other")
            },
            {
                "id": "light_chain",  # column name
                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                "names": [
                    "amino acid sequence",
                    "light chain amino acid sequence",
                    "light chain AA sequence",
                ],
                "description": "antibody light chain amino acid sequence",  # description (optional, except for "Other")
            },
        ],
        "license": "CC BY 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            {
                "url": "https://doi.org/10.1073/pnas.1810576116",
                "description": "corresponding publication",
            },
            {
                "url": "https://tdcommons.ai/single_pred_tasks/develop/#tap",
                "description": "data source",
            },
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        # The entry ends with its own closing brace so it is a complete,
        # parseable BibTeX record.
        "bibtex": [
            """@article{Raybould2019,
doi = {10.1073/pnas.1810576116},
url = {https://doi.org/10.1073/pnas.1810576116},
year = {2019},
month = feb,
publisher = {Proceedings of the National Academy of Sciences},
volume = {116},
number = {10},
pages = {4025--4030},
author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk
and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek
and Jiye Shi and Charlotte M. Deane},
title = {Five computational developability guidelines for therapeutic antibody profiling},
journal = {Proceedings of the National Academy of Sciences}
}""",
        ],
    }

    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    # NOTE: both registrations change PyYAML's representers for the whole
    # process, not only for the dump below.
    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dump
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")